-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Collect<Vec<u16>> from range doesn't optimize well. #43124
Comments
The interesting thing is that, once the specialization for
Compiler version
Update: It seems the slowness comes from the loop itself
|
Looking at the assembly output for this, on stable (1.18), the difference seems to be mostly that the collect case is not vectorized. On nightly things are a slightly more crazy. The function is vectorized, however there seems to be a lot of other redundant code that would ideally be optimised out. There is a function call to `alloc::allocator::Layout::repeat to calculate the size of the needed allocation which shouldn't be needed for primitive types, and checks to see if that call returned something valid. There is also some exception landing pad stuff (gcc_except_table). That probably doesn't help speed-wise. Assembly output .section .text._ZN8collect219create_with_collect17h68b620e6057f6286E,"ax",@progbits
.globl _ZN8collect219create_with_collect17h68b620e6057f6286E
.p2align 4, 0x90
.type _ZN8collect219create_with_collect17h68b620e6057f6286E,@function
_ZN8collect219create_with_collect17h68b620e6057f6286E:
.Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, DW.ref.rust_eh_personality
.cfi_lsda 27, .Lexception0
pushq %r14
.Lcfi4:
.cfi_def_cfa_offset 16
pushq %rbx
.Lcfi5:
.cfi_def_cfa_offset 24
subq $72, %rsp
.Lcfi6:
.cfi_def_cfa_offset 96
.Lcfi7:
.cfi_offset %rbx, -24
.Lcfi8:
.cfi_offset %r14, -16
movq %rdi, %r14
movq $2, 48(%rsp)
xorps %xmm0, %xmm0
movups %xmm0, 56(%rsp)
movaps .LCPI2_0(%rip), %xmm0
movaps %xmm0, 32(%rsp)
.Ltmp0:
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movl $65535, %edx
callq _ZN5alloc9allocator6Layout6repeat17hfa9f21888d235374E@PLT
.Ltmp1:
cmpq $0, (%rsp)
je .LBB2_2
movq 8(%rsp), %rdi
testq %rdi, %rdi
je .LBB2_2
movq 16(%rsp), %rsi
.Ltmp2:
movq %rsp, %rdx
callq __rust_alloc@PLT
.Ltmp3:
testq %rax, %rax
je .LBB2_7
movq %rax, 48(%rsp)
movq $65535, 56(%rsp)
movw $1, %bx
xorl %esi, %esi
xorl %edx, %edx
.p2align 4, 0x90
.LBB2_11:
movzwl %bx, %edi
xorl %ecx, %ecx
cmpl $65535, %edi
setne %cl
addl %ebx, %ecx
cmpl $65535, %edi
movw %si, (%rax,%rdx,2)
leaq 1(%rdx), %rdx
movw %bx, %si
movw %cx, %bx
jne .LBB2_11
movq %rdx, 64(%rsp)
movq %rdx, 16(%r14)
movups 48(%rsp), %xmm0
movups %xmm0, (%r14)
movq %r14, %rax
addq $72, %rsp
popq %rbx
popq %r14
retq
.LBB2_2:
.Ltmp4:
leaq str.3(%rip), %rsi
movq %rsp, %rdi
movl $30, %edx
callq _ZN5alloc9allocator8AllocErr13invalid_input17hbab45037cac24347E@PLT
.Ltmp5:
movq (%rsp), %rax
movups 8(%rsp), %xmm0
movaps %xmm0, 32(%rsp)
jmp .LBB2_8
.LBB2_7:
movups 8(%rsp), %xmm0
movaps %xmm0, 32(%rsp)
.LBB2_8:
movq %rax, (%rsp)
movaps 32(%rsp), %xmm0
movups %xmm0, 8(%rsp)
.Ltmp6:
movq %rsp, %rdi
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h14166e9ef3dd1a36E
.Ltmp7:
.LBB2_13:
.Ltmp8:
movq %rax, %rbx
leaq 48(%rsp), %rdi
callq _ZN4core3ptr13drop_in_place17hdfdbb953a2d7ca4eE
movq %rbx, %rdi
callq _Unwind_Resume@PLT
.Lfunc_end2:
.size _ZN8collect219create_with_collect17h68b620e6057f6286E, .Lfunc_end2-_ZN8collect219create_with_collect17h68b620e6057f6286E
.cfi_endproc
.section .gcc_except_table,"a",@progbits
.p2align 2
GCC_except_table2:
.Lexception0:
.byte 255
.byte 155
.asciz "\234"
.byte 3
.byte 26
.long .Ltmp0-.Lfunc_begin0
.long .Ltmp7-.Ltmp0
.long .Ltmp8-.Lfunc_begin0
.byte 0
.long .Ltmp7-.Lfunc_begin0
.long .Lfunc_end2-.Ltmp7
.long 0
.byte 0
.p2align 2
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI3_0:
.quad 65535
.quad 65535 The manual version is much cleaner: Assembly output .section .text._ZN8collect215create_manually17h733d7e09fad01d19E,"ax",@progbits
.globl _ZN8collect215create_manually17h733d7e09fad01d19E
.p2align 4, 0x90
.type _ZN8collect215create_manually17h733d7e09fad01d19E,@function
_ZN8collect215create_manually17h733d7e09fad01d19E:
.cfi_startproc
pushq %rbx
.Lcfi9:
.cfi_def_cfa_offset 16
subq $48, %rsp
.Lcfi10:
.cfi_def_cfa_offset 64
.Lcfi11:
.cfi_offset %rbx, -16
movq %rdi, %rbx
leaq 8(%rsp), %rdx
movl $131070, %edi
movl $2, %esi
callq __rust_alloc_zeroed@PLT
testq %rax, %rax
je .LBB3_4
movq %rax, 8(%rsp)
movaps .LCPI3_0(%rip), %xmm0
movups %xmm0, 16(%rsp)
leaq 131070(%rax), %rcx
xorl %edx, %edx
.p2align 4, 0x90
.LBB3_2:
movw %dx, (%rax,%rdx,2)
leaq 1(%rdx), %rsi
movw %si, 2(%rax,%rdx,2)
incl %esi
movw %si, 4(%rax,%rdx,2)
leaq 3(%rdx), %rsi
movw %si, 6(%rax,%rdx,2)
incl %esi
movw %si, 8(%rax,%rdx,2)
leaq 10(%rax,%rdx,2), %rsi
addq $5, %rdx
cmpq %rcx, %rsi
jne .LBB3_2
movq 24(%rsp), %rax
movq %rax, 16(%rbx)
movups 8(%rsp), %xmm0
movups %xmm0, (%rbx)
movq %rbx, %rax
addq $48, %rsp
popq %rbx
retq
.LBB3_4:
movq 8(%rsp), %rax
movups 16(%rsp), %xmm0
movaps %xmm0, 32(%rsp)
movq %rax, 8(%rsp)
movaps 32(%rsp), %xmm0
movups %xmm0, 16(%rsp)
leaq 8(%rsp), %rdi
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h14166e9ef3dd1a36E
.Lfunc_end3:
.size _ZN8collect215create_manually17h733d7e09fad01d19E, .Lfunc_end3-_ZN8collect215create_manually17h733d7e09fad01d19E
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI4_0:
.quad 65535
.quad 65535 Compiler version:
EDIT: |
Did some more digging into this.
It seems llvm is unable to vectorize the loop for the u16 case, but it can for the u32 case, which probably explains the speed difference. I also found that there is a similar slowdown of the collect variant compared doing it manually for u64 when compiling for i686, but not for x86-64. llvm debug output for u16 variant of collect
This code also gives the same complaint about exit count as using pub fn create_with_range() -> Vec<T> {
let mut arr = vec![0;SIZE as usize];
for n in 0..SIZE {
unsafe {
*arr.get_unchecked_mut(n as usize) = n;
}
}
arr
} llvm debug output for u32 variant of collect
...etc Another observation is that according to This comment, that PR would have likely solve the issue. (As TryFrom should now inline properly as a result of #43248) |
I managed to track down the main issue: ...
%17 = getelementptr inbounds i32, i32* %ptr.0148.i.i.i.i, i64 6
%iter.sroa.0.1.i.i.i.i.6 = add nsw i32 %iter.sroa.0.0147.i.i.i.i, 7
store i32 %iter.sroa.0.1.i.i.i.i.5, i32* %17, align 4, !noalias !4
%18 = getelementptr inbounds i32, i32* %ptr.0148.i.i.i.i, i64 7
; Comparison using equality instruction:
%exitcond.i.i.i.6 = icmp eq i32 %iter.sroa.0.1.i.i.i.i.6, 32767
br i1 %exitcond.i.i.i.6, label %_ZN4core4iter8iterator8Iterator7collect17hc35826f1180bb746E.exit, label %bb35.i.i.i.i
_ZN4core4iter8iterator8Iterator7collect17hc35826f1180bb746E.exit: ; preds = %bb35.i.i.i.i
... This doesn't seem to happen with u16: ...
store i16 %iter.sroa.0.0.152.i.i.i.i, i16* %ptr.0150.i.i.i.i, align 2, !noalias !4
%12 = getelementptr inbounds i16, i16* %ptr.0150.i.i.i.i, i64 1
%13 = add i64 %local_len.sroa.5.0149.i.i.i.i, 1
; comparison using unsigned less than instruction:
%14 = icmp ult i16 %.iter.sroa.0.0151.i.i.i.i, 32767
%15 = zext i1 %14 to i16
%.iter.sroa.0.0.i.i.i.i = add i16 %15, %.iter.sroa.0.0151.i.i.i.i
br i1 %14, label %bb35.i.i.i.i, label %_ZN4core4iter8iterator8Iterator7collect17h730733b2264e9b53E.exit
... I first tried to fix this by chaning the implementation of What did help, was to change the impl of #[inline]
fn add_one(&self) -> Self {
self.checked_add(1).expect("Overflow in step!")
} This gives me benchmark results of:
Collect is now only slightly slower than the manual implementation using unsafe. (u32 gives the same results as before, i.e same as the manual implementation using unsafe.) There are still some differences between the code generated using unsafe and using collect for u16, for instance using collect adds some unwinding stuff, and the generated SIMD is a bit different, but it's still a huge improvement. |
Add an overflow check in the Iter::next() impl for Range<_> to help with vectorization. This helps with vectorization in some cases, such as (0..u16::MAX).collect::<Vec<u16>>(), as LLVM is able to change the loop condition to use equality instead of less than and should help with #43124. (See also my [last comment](#43124 (comment)) there.) This PR makes collect on ranges of u16, i16, i8, and u8 **significantly** faster (at least on x86-64 and i686), and pretty close, though not quite equivalent to a [manual unsafe implementation](https://is.gd/nkoecB). 32 ( and 64-bit values on x86-64) bit values were already vectorized without this change, and they still are. This PR doesn't seem to help with 64-bit values on i686, as they still don't vectorize well compared to doing a manual loop. I'm a bit unsure if this was the best way of implementing this, I tried to do it with as little changes as possible and avoided changing the step trait and the behavior in RangeFrom (I'll leave that for others like #43127 to discuss wider changes to the trait). I tried simply changing the comparison to `self.start != self.end` though that made the compiler segfault when compiling stage0, so I went with this method instead for now. As for `next_back()`, reverse ranges seem to optimise properly already.
I think this is solved now with #47944 due to |
I re-ran the benchmarks and the results seem identical between using collect and the unsafe manual version, so it seems this is fixed now. |
(At least on x86-64 nightly)
Using code like this:
https://is.gd/nkoecB
The version using collect is significantly slower than creating a vec of 0-values and setting the values manually.
On the other hand, if using u32 instead with the same code collect is much better:
Same with u64:
I suspect this may be SIMD-related. Will see if there are similar results on stable.
The text was updated successfully, but these errors were encountered: