
Rust 1.25.0 regressed the performance of encoding_rs's UTF-8 validation on i686 #49873

Closed
hsivonen opened this issue Apr 11, 2018 · 34 comments · Fixed by #50827
Labels
A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues. I-slow Issue: Problems and improvements with respect to performance of generated code. P-medium Medium priority regression-from-stable-to-stable Performance or correctness regression from one stable version to another. WG-llvm Working group: LLVM backend code generation

Comments

@hsivonen
Member

When Firefox switched from Rust 1.24.0 to Rust 1.25.0, the win32 performance of encoding_rs's UTF-8 validation function dropped by 12.5% on ASCII input. encoding_rs's UTF-8 validation function is a fork of the Rust standard library's validation function; it replaces the ASCII-acceleration ALU trick (which autovectorizes on x86_64 but not on i686, and works only in the aligned case) with explicit SIMD code that handles both the aligned and unaligned cases.
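For reference, the standard library's ALU trick amounts to a word-at-a-time high-bit check, roughly as in the sketch below (a minimal illustration, not the actual library code; the constant and function names are made up here):

const NONASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

// A usize word of input is all ASCII iff no byte in it has its high bit set.
fn contains_nonascii(word: usize) -> bool {
    word & NONASCII_MASK != 0
}

// Aligned fast path: scan whole words; the unaligned head and the tail are
// handled a byte at a time by the caller.
fn all_ascii_words(words: &[usize]) -> bool {
    words.iter().all(|&w| !contains_nonascii(w))
}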

When the input is all ASCII, the function should stay in either the aligned-case or the unaligned-case inner loop, which loads 16 bytes using movdqa or movdqu, respectively, performs pmovmskb on the xmm register, compares the result to zero, and jumps back to the start of the loop if the result is zero.
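In Rust terms, that inner loop corresponds to something like the sketch below, written with core::arch SSE2 intrinsics and showing only the unaligned (movdqu) case for brevity; encoding_rs's actual simd-accel code is structured differently and also has a movdqa loop, and the function name here is made up:

#[cfg(target_arch = "x86_64")]
unsafe fn ascii_prefix_len(bytes: &[u8]) -> usize {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8};
    let mut offset = 0;
    while offset + 16 <= bytes.len() {
        // movdqu + pmovmskb: bit i of the mask is the high bit of byte i,
        // so a zero mask means these 16 bytes are all ASCII.
        let v = _mm_loadu_si128(bytes.as_ptr().add(offset) as *const __m128i);
        let mask = _mm_movemask_epi8(v);
        if mask != 0 {
            // First non-ASCII byte within this chunk.
            return offset + mask.trailing_zeros() as usize;
        }
        offset += 16;
    }
    // Scalar tail for the final partial chunk.
    while offset < bytes.len() && bytes[offset] < 0x80 {
        offset += 1;
    }
    offset
}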

When compiled for i686 Linux with opt level 2 (which Firefox uses) using Rust 1.24.0, the result is exactly as expected.

Unaligned:

.LBB12_3:
	movdqu	(%edx,%eax), %xmm0
	pmovmskb	%xmm0, %ebp
	testl	%ebp, %ebp
	jne	.LBB12_9
	addl	$16, %eax
	cmpl	%ebx, %eax
	jbe	.LBB12_3
	jmp	.LBB12_5
	.p2align	4, 0x90

Aligned:

.LBB12_7:
	movdqa	(%edx,%eax), %xmm0
	pmovmskb	%xmm0, %ebp
	testl	%ebp, %ebp
	jne	.LBB12_9
	addl	$16, %eax
	cmpl	%ebx, %eax
	jbe	.LBB12_7
	.p2align	4, 0x90

(Windows wouldn't let me see the asm due to LLVM deeming the IR invalid with --emit asm.)

When compiled with Rust 1.25.0, the result is more complicated:

  1. There are two instances of movdqa and two instances of movdqu, suggesting that the first trip through the loop has been unrolled into a separate copy outside the loop proper.
  2. In the actual loop, ALU instructions have been moved around, including placing one between the SSE2 instructions.

Both of these transformations look like plausible optimizations, but considering the performance result from Firefox CI, it seems these transformations made performance worse.

.LBB16_1:
	movl	%edx, %ebp
	leal	(%ecx,%edi), %ebx
	movl	$0, %esi
	subl	%edi, %ebp
	cmpl	$16, %ebp
	jb	.LBB16_22
	leal	-16(%ebp), %eax
	testb	$15, %bl
	movl	%eax, 20(%esp)
	je	.LBB16_9
	movdqu	(%ebx), %xmm0
	movl	%edx, 12(%esp)
	xorl	%eax, %eax
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	.LBB16_7
	movl	24(%esp), %eax
	xorl	%esi, %esi
	leal	(%eax,%edi), %ecx
	.p2align	4, 0x90
.LBB16_5:
	leal	16(%esi), %eax
	cmpl	20(%esp), %eax
	ja	.LBB16_20
	movdqu	(%ecx,%esi), %xmm0
	movl	%eax, %esi
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	je	.LBB16_5
.LBB16_7:
	testl	%edx, %edx
	je	.LBB16_12
	bsfl	%edx, %esi
	jmp	.LBB16_13
.LBB16_9:
	movdqa	(%ebx), %xmm0
	xorl	%ecx, %ecx
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	.LBB16_15
	testl	%eax, %eax
	je	.LBB16_19
.LBB16_11:
	bsfl	%eax, %esi
	addl	%ecx, %esi
	jmp	.LBB16_14
.LBB16_12:
	movl	$32, %esi
.LBB16_13:
	movl	12(%esp), %edx
	addl	%eax, %esi
.LBB16_14:
	movb	(%ebx,%esi), %al
	jmp	.LBB16_24
.LBB16_15:
	movl	$16, %esi
	.p2align	4, 0x90
.LBB16_16:
	cmpl	20(%esp), %esi
	ja	.LBB16_22
	movdqa	(%ebx,%esi), %xmm0
	addl	$16, %esi
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	.LBB16_16
	addl	$-16, %esi
	movl	%esi, %ecx
	testl	%eax, %eax
	jne	.LBB16_11

The asm was obtained by compiling encoding_rs (Firefox uses 0.7.2) using RUSTC_BOOTSTRAP=1 RUSTFLAGS='-C opt-level=2 --emit asm' cargo build --target i686-unknown-linux-gnu --release --features simd-accel and searching for utf8_valid_up_to in the .s file.

@glandium
Contributor

Note that I observed something similar on x86-64 (and Firefox has seen a regression on x86-64 too).

@hsivonen
Member Author

The loop is inlined from validate_ascii.

@hsivonen
Member Author

The presumed unrolling of the first loop trip happens before LLVM, but the inner loop (which has to be what's slow to get the 12.5% drop in perf) looks the same on the IR level, so this is probably an LLVM 4.0 vs. LLVM 6.0 thing.

IR from 1.24.0:

; encoding_rs::Encoding::ascii_valid_up_to
; Function Attrs: readonly uwtable
define i32 @_ZN11encoding_rs8Encoding17ascii_valid_up_to17h3e9b04dc68770813E([0 x i8]* noalias nonnull readonly %bytes.0, i32 %bytes.1) unnamed_addr #6 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %0 = icmp ugt i32 %bytes.1, 15
  br i1 %0, label %bb3.i.i, label %bb30.preheader.i.i

bb3.i.i:                                          ; preds = %start
  %1 = add i32 %bytes.1, -16
  %2 = ptrtoint [0 x i8]* %bytes.0 to i32
  %3 = and i32 %2, 15
  %4 = icmp eq i32 %3, 0
  br i1 %4, label %bb5.i.i.preheader, label %bb18.i.i.preheader

bb18.i.i.preheader:                               ; preds = %bb3.i.i
  br label %bb18.i.i

bb5.i.i.preheader:                                ; preds = %bb3.i.i
  br label %bb5.i.i

bb5.i.i:                                          ; preds = %bb5.i.i.preheader, %bb10.i.i
  %offset.0.i.i = phi i32 [ %10, %bb10.i.i ], [ 0, %bb5.i.i.preheader ]
  %5 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %offset.0.i.i
  %6 = bitcast i8* %5 to <16 x i8>*
  %7 = load <16 x i8>, <16 x i8>* %6, align 16, !alias.scope !6596, !noalias !6601
  %8 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %7) #7
  %9 = icmp eq i32 %8, 0
  br i1 %9, label %bb10.i.i, label %bb15.sink.split.i.i.loopexit

bb10.i.i:                                         ; preds = %bb5.i.i
  %10 = add i32 %offset.0.i.i, 16
  %11 = icmp ugt i32 %10, %1
  br i1 %11, label %bb30.preheader.i.i.loopexit, label %bb5.i.i

bb15.sink.split.i.i.loopexit:                     ; preds = %bb5.i.i
  br label %bb15.sink.split.i.i

bb15.sink.split.i.i.loopexit37:                   ; preds = %bb18.i.i
  br label %bb15.sink.split.i.i

bb15.sink.split.i.i:                              ; preds = %bb15.sink.split.i.i.loopexit37, %bb15.sink.split.i.i.loopexit
  %.sink35.i.i = phi i32 [ %8, %bb15.sink.split.i.i.loopexit ], [ %15, %bb15.sink.split.i.i.loopexit37 ]
  %offset.1.sink.i.i = phi i32 [ %offset.0.i.i, %bb15.sink.split.i.i.loopexit ], [ %offset.1.i.i, %bb15.sink.split.i.i.loopexit37 ]
  %12 = tail call i32 @llvm.cttz.i32(i32 %.sink35.i.i, i1 false) #7
  %13 = add i32 %offset.1.sink.i.i, %12
  br label %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit

bb18.i.i:                                         ; preds = %bb18.i.i.preheader, %bb23.i.i
  %offset.1.i.i = phi i32 [ %17, %bb23.i.i ], [ 0, %bb18.i.i.preheader ]
  %14 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %offset.1.i.i
  %simd.0.ptr.sroa_cast.i.i.i = bitcast i8* %14 to <16 x i8>*
  %simd.0.copyload.i.i.i = load <16 x i8>, <16 x i8>* %simd.0.ptr.sroa_cast.i.i.i, align 1, !alias.scope !6596, !noalias !6601
  %15 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %simd.0.copyload.i.i.i) #7
  %16 = icmp eq i32 %15, 0
  br i1 %16, label %bb23.i.i, label %bb15.sink.split.i.i.loopexit37

bb23.i.i:                                         ; preds = %bb18.i.i
  %17 = add i32 %offset.1.i.i, 16
  %18 = icmp ugt i32 %17, %1
  br i1 %18, label %bb30.preheader.i.i.loopexit38, label %bb18.i.i

bb30.preheader.i.i.loopexit:                      ; preds = %bb10.i.i
  br label %bb30.preheader.i.i

bb30.preheader.i.i.loopexit38:                    ; preds = %bb23.i.i
  br label %bb30.preheader.i.i

bb30.preheader.i.i:                               ; preds = %bb30.preheader.i.i.loopexit38, %bb30.preheader.i.i.loopexit, %start
  %offset.2.ph.i.i = phi i32 [ 0, %start ], [ %10, %bb30.preheader.i.i.loopexit ], [ %17, %bb30.preheader.i.i.loopexit38 ]
  %19 = icmp ult i32 %offset.2.ph.i.i, %bytes.1
  br i1 %19, label %bb33.i.i.preheader, label %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit

bb33.i.i.preheader:                               ; preds = %bb30.preheader.i.i
  br label %bb33.i.i

bb33.i.i:                                         ; preds = %bb33.i.i.preheader, %bb35.i.i
  %offset.247.i.i = phi i32 [ %23, %bb35.i.i ], [ %offset.2.ph.i.i, %bb33.i.i.preheader ]
  %20 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %offset.247.i.i
  %21 = load i8, i8* %20, align 1, !alias.scope !6596, !noalias !6601
  %22 = icmp slt i8 %21, 0
  br i1 %22, label %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit.loopexit, label %bb35.i.i

bb35.i.i:                                         ; preds = %bb33.i.i
  %23 = add nuw i32 %offset.247.i.i, 1
  %24 = icmp ult i32 %23, %bytes.1
  br i1 %24, label %bb33.i.i, label %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit.loopexit

_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit.loopexit: ; preds = %bb35.i.i, %bb33.i.i
  %_0.0.i.ph = phi i32 [ %bytes.1, %bb35.i.i ], [ %offset.247.i.i, %bb33.i.i ]
  br label %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit

_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit: ; preds = %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit.loopexit, %bb15.sink.split.i.i, %bb30.preheader.i.i
  %_0.0.i = phi i32 [ %bytes.1, %bb30.preheader.i.i ], [ %13, %bb15.sink.split.i.i ], [ %_0.0.i.ph, %_ZN11encoding_rs5ascii17ascii_valid_up_to17h5c1d65b6f83b80ecE.exit.loopexit ]
  ret i32 %_0.0.i
}

IR from 1.25.0:

; encoding_rs::utf_8::utf8_valid_up_to
; Function Attrs: uwtable
define internal fastcc i32 @_ZN11encoding_rs5utf_816utf8_valid_up_to17he6ec0248b61d42c5E([0 x i8]* noalias nonnull readonly %bytes.0, i32 %bytes.1) unnamed_addr #4 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  br label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i"

bb1.i.i.i.i:                                      ; preds = %bb73.i
; call core::slice::slice_index_order_fail
  tail call void @_ZN4core5slice22slice_index_order_fail17hd587cac96d801a96E(i32 %94, i32 %bytes.1)
  unreachable

"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i": ; preds = %bb73.i, %start
  %index.0151.i = phi i32 [ 0, %start ], [ %94, %bb73.i ]
  %0 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %index.0151.i
  %1 = sub i32 %bytes.1, %index.0151.i
  %2 = icmp ugt i32 %1, 15
  br i1 %2, label %bb3.i.i, label %bb29.i.i

bb3.i.i:                                          ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i"
  %3 = add i32 %1, -16
  %4 = ptrtoint i8* %0 to i32
  %5 = and i32 %4, 15
  %6 = icmp eq i32 %5, 0
  %7 = bitcast i8* %0 to <16 x i8>*
  br i1 %6, label %bb5.preheader.i.i, label %bb18.preheader.i.i

bb18.preheader.i.i:                               ; preds = %bb3.i.i
  %simd.0.copyload.i46.i.i = load <16 x i8>, <16 x i8>* %7, align 1, !alias.scope !27, !noalias !32
  %8 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %simd.0.copyload.i46.i.i) #10
  %9 = icmp eq i32 %8, 0
  br i1 %9, label %bb23.i.i.preheader, label %bb22.i.i

bb23.i.i.preheader:                               ; preds = %bb18.preheader.i.i
  br label %bb23.i.i

bb5.preheader.i.i:                                ; preds = %bb3.i.i
  %10 = load <16 x i8>, <16 x i8>* %7, align 16, !alias.scope !27, !noalias !37
  %11 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %10) #10
  %12 = icmp eq i32 %11, 0
  br i1 %12, label %bb10.i.i.preheader, label %bb9.i.i

bb10.i.i.preheader:                               ; preds = %bb5.preheader.i.i
  br label %bb10.i.i

bb5.i.i:                                          ; preds = %bb10.i.i
  %13 = getelementptr inbounds i8, i8* %0, i32 %20
  %14 = bitcast i8* %13 to <16 x i8>*
  %15 = load <16 x i8>, <16 x i8>* %14, align 16, !alias.scope !27, !noalias !37
  %16 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %15) #10
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %bb10.i.i, label %bb9.i.i

bb9.i.i:                                          ; preds = %bb5.i.i, %bb5.preheader.i.i
  %offset.0.lcssa.i.i = phi i32 [ 0, %bb5.preheader.i.i ], [ %20, %bb5.i.i ]
  %.lcssa34.i.i = phi i32 [ %11, %bb5.preheader.i.i ], [ %16, %bb5.i.i ]
  %18 = tail call i32 @llvm.cttz.i32(i32 %.lcssa34.i.i, i1 false) #10, !range !40
  %19 = add i32 %18, %offset.0.lcssa.i.i
  br label %bb7.i.sink.split

bb10.i.i:                                         ; preds = %bb10.i.i.preheader, %bb5.i.i
  %offset.043.i.i = phi i32 [ %20, %bb5.i.i ], [ 0, %bb10.i.i.preheader ]
  %20 = add i32 %offset.043.i.i, 16
  %21 = icmp ugt i32 %20, %3
  br i1 %21, label %bb29.i.i, label %bb5.i.i

bb18.i.i:                                         ; preds = %bb23.i.i
  %22 = getelementptr inbounds i8, i8* %0, i32 %27
  %simd.0.ptr.sroa_cast.i.i.i = bitcast i8* %22 to <16 x i8>*
  %simd.0.copyload.i.i.i = load <16 x i8>, <16 x i8>* %simd.0.ptr.sroa_cast.i.i.i, align 1, !alias.scope !27, !noalias !32
  %23 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %simd.0.copyload.i.i.i) #10
  %24 = icmp eq i32 %23, 0
  br i1 %24, label %bb23.i.i, label %bb22.i.i

bb22.i.i:                                         ; preds = %bb18.i.i, %bb18.preheader.i.i
  %offset.1.lcssa.i.i = phi i32 [ 0, %bb18.preheader.i.i ], [ %27, %bb18.i.i ]
  %.lcssa38.i.i = phi i32 [ %8, %bb18.preheader.i.i ], [ %23, %bb18.i.i ]
  %25 = tail call i32 @llvm.cttz.i32(i32 %.lcssa38.i.i, i1 false) #10, !range !40
  %26 = add i32 %25, %offset.1.lcssa.i.i
  br label %bb7.i.sink.split

bb23.i.i:                                         ; preds = %bb23.i.i.preheader, %bb18.i.i
  %offset.147.i.i = phi i32 [ %27, %bb18.i.i ], [ 0, %bb23.i.i.preheader ]
  %27 = add i32 %offset.147.i.i, 16
  %28 = icmp ugt i32 %27, %3
  br i1 %28, label %bb29.i.i, label %bb18.i.i

bb29.i.i:                                         ; preds = %bb23.i.i, %bb10.i.i, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i"
  %offset.2.i.i = phi i32 [ 0, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i" ], [ %20, %bb10.i.i ], [ %27, %bb23.i.i ]
  %29 = icmp ult i32 %offset.2.i.i, %1
  br i1 %29, label %bb33.i.i.preheader, label %bb5

bb33.i.i.preheader:                               ; preds = %bb29.i.i
  br label %bb33.i.i

bb33.i.i:                                         ; preds = %bb33.i.i.preheader, %bb35.i.i
  %offset.342.i.i = phi i32 [ %33, %bb35.i.i ], [ %offset.2.i.i, %bb33.i.i.preheader ]
  %30 = getelementptr inbounds i8, i8* %0, i32 %offset.342.i.i
  %31 = load i8, i8* %30, align 1, !alias.scope !27, !noalias !41
  %32 = icmp slt i8 %31, 0
  br i1 %32, label %bb7.i, label %bb35.i.i

bb35.i.i:                                         ; preds = %bb33.i.i
  %33 = add nuw i32 %offset.342.i.i, 1
  %34 = icmp ult i32 %33, %1
  br i1 %34, label %bb33.i.i, label %bb5

bb7.i.sink.split:                                 ; preds = %bb9.i.i, %bb22.i.i
  %.sink66 = phi i32 [ %26, %bb22.i.i ], [ %19, %bb9.i.i ]
  %35 = getelementptr inbounds i8, i8* %0, i32 %.sink66
  %36 = load i8, i8* %35, align 1, !alias.scope !27, !noalias !41
  br label %bb7.i

bb7.i:                                            ; preds = %bb33.i.i, %bb7.i.sink.split
  %_12.sroa.1294.1.ph.i = phi i32 [ %.sink66, %bb7.i.sink.split ], [ %offset.342.i.i, %bb33.i.i ]
  %_12.sroa.8.1.ph.i = phi i8 [ %36, %bb7.i.sink.split ], [ %31, %bb33.i.i ]
  %37 = add i32 %_12.sroa.1294.1.ph.i, %index.0151.i
  br label %bb9.i

bb9.i:                                            ; preds = %bb72.i, %bb7.i
  %first.0.i = phi i8 [ %_12.sroa.8.1.ph.i, %bb7.i ], [ %92, %bb72.i ]
  %index.1.i = phi i32 [ %37, %bb7.i ], [ %47, %bb72.i ]
  %38 = zext i8 %first.0.i to i32
  %39 = getelementptr inbounds [256 x i8], [256 x i8]* @_ZN11encoding_rs10utf_8_core15UTF8_CHAR_WIDTH17hf8319a77b9cec8e4E, i32 0, i32 %38
  %40 = load i8, i8* %39, align 1
  switch i8 %40, label %bb5 [
    i8 2, label %bb11.i
    i8 3, label %bb12.i
    i8 4, label %bb13.i
  ]

bb11.i:                                           ; preds = %bb9.i
  %41 = add i32 %index.1.i, 1
  %42 = icmp ult i32 %41, %bytes.1
  br i1 %42, label %bb20.i, label %bb5

bb12.i:                                           ; preds = %bb9.i
  %43 = add i32 %index.1.i, 1
  %44 = icmp ult i32 %43, %bytes.1
  br i1 %44, label %bb25.i, label %bb5

bb13.i:                                           ; preds = %bb9.i
  %45 = add i32 %index.1.i, 1
  %46 = icmp ult i32 %45, %bytes.1
  br i1 %46, label %bb48.i, label %bb5

bb15.i:                                           ; preds = %bb67.i, %bb43.i, %bb20.i
  %index.2.i = phi i32 [ %84, %bb67.i ], [ %59, %bb43.i ], [ %41, %bb20.i ]
  %47 = add i32 %index.2.i, 1
  %48 = icmp eq i32 %47, %bytes.1
  br i1 %48, label %bb5, label %bb71.i

bb20.i:                                           ; preds = %bb11.i
  %49 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %41
  %50 = load i8, i8* %49, align 1, !alias.scope !42, !noalias !43
  %51 = and i8 %50, -64
  %52 = icmp eq i8 %51, -128
  br i1 %52, label %bb15.i, label %bb5

bb25.i:                                           ; preds = %bb12.i
  %53 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %43
  %54 = load i8, i8* %53, align 1, !alias.scope !42, !noalias !43
  %cond9.i = icmp eq i8 %first.0.i, -32
  %55 = icmp ult i8 %54, -64
  %56 = and i8 %54, -32
  %57 = icmp eq i8 %56, -96
  %58 = and i1 %cond9.i, %57
  br i1 %58, label %bb26.i, label %bb30.i

bb26.i:                                           ; preds = %bb37.i, %bb34.i, %bb30.i, %bb25.i
  %59 = add i32 %index.1.i, 2
  %60 = icmp ult i32 %59, %bytes.1
  br i1 %60, label %bb43.i, label %bb5

bb30.i:                                           ; preds = %bb25.i
  %first.0.off101.i = add i8 %first.0.i, 31
  %61 = icmp ult i8 %first.0.off101.i, 12
  %62 = icmp slt i8 %54, 0
  %or.cond76.i = and i1 %61, %62
  %or.cond77.i = and i1 %55, %or.cond76.i
  br i1 %or.cond77.i, label %bb26.i, label %bb34.i

bb34.i:                                           ; preds = %bb30.i
  %cond10.i = icmp eq i8 %first.0.i, -19
  %or.cond78.i = and i1 %cond10.i, %62
  %63 = icmp ult i8 %54, -96
  %or.cond79.i = and i1 %63, %or.cond78.i
  br i1 %or.cond79.i, label %bb26.i, label %bb37.i

bb37.i:                                           ; preds = %bb34.i
  %64 = and i8 %first.0.i, -2
  %65 = icmp eq i8 %64, -18
  %or.cond81.i = and i1 %65, %62
  %or.cond82.i = and i1 %55, %or.cond81.i
  br i1 %or.cond82.i, label %bb26.i, label %bb5

bb43.i:                                           ; preds = %bb26.i
  %66 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %59
  %67 = load i8, i8* %66, align 1, !alias.scope !42, !noalias !43
  %68 = and i8 %67, -64
  %69 = icmp eq i8 %68, -128
  br i1 %69, label %bb15.i, label %bb5

bb48.i:                                           ; preds = %bb13.i
  %70 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %45
  %71 = load i8, i8* %70, align 1, !alias.scope !42, !noalias !43
  %cond.i = icmp eq i8 %first.0.i, -16
  %.off.i = add i8 %71, 112
  %72 = icmp ult i8 %.off.i, 48
  %73 = and i1 %cond.i, %72
  br i1 %73, label %bb49.i, label %bb53.i

bb49.i:                                           ; preds = %bb57.i, %bb53.i, %bb48.i
  %74 = add i32 %index.1.i, 2
  %75 = icmp ult i32 %74, %bytes.1
  br i1 %75, label %bb62.i, label %bb5

bb53.i:                                           ; preds = %bb48.i
  %76 = icmp ult i8 %71, -64
  %first.0.off.i = add i8 %first.0.i, 15
  %77 = icmp ult i8 %first.0.off.i, 3
  %78 = icmp slt i8 %71, 0
  %or.cond86.i = and i1 %77, %78
  %or.cond87.i = and i1 %76, %or.cond86.i
  br i1 %or.cond87.i, label %bb49.i, label %bb57.i

bb57.i:                                           ; preds = %bb53.i
  %cond8.i = icmp eq i8 %first.0.i, -12
  %or.cond88.i = and i1 %cond8.i, %78
  %79 = icmp ult i8 %71, -112
  %or.cond89.i = and i1 %79, %or.cond88.i
  br i1 %or.cond89.i, label %bb49.i, label %bb5

bb62.i:                                           ; preds = %bb49.i
  %80 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %74
  %81 = load i8, i8* %80, align 1, !alias.scope !42, !noalias !43
  %82 = and i8 %81, -64
  %83 = icmp eq i8 %82, -128
  br i1 %83, label %bb64.i, label %bb5

bb64.i:                                           ; preds = %bb62.i
  %84 = add i32 %index.1.i, 3
  %85 = icmp ult i32 %84, %bytes.1
  br i1 %85, label %bb67.i, label %bb5

bb67.i:                                           ; preds = %bb64.i
  %86 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %84
  %87 = load i8, i8* %86, align 1, !alias.scope !42, !noalias !43
  %88 = and i8 %87, -64
  %89 = icmp eq i8 %88, -128
  br i1 %89, label %bb15.i, label %bb5

bb71.i:                                           ; preds = %bb15.i
  %90 = icmp ult i32 %47, %bytes.1
  br i1 %90, label %bb72.i, label %panic7.i, !prof !44

bb72.i:                                           ; preds = %bb71.i
  %91 = getelementptr inbounds [0 x i8], [0 x i8]* %bytes.0, i32 0, i32 %47
  %92 = load i8, i8* %91, align 1, !alias.scope !42, !noalias !43
  %93 = icmp sgt i8 %92, -1
  br i1 %93, label %bb73.i, label %bb9.i

bb73.i:                                           ; preds = %bb72.i
  %94 = add i32 %index.2.i, 2
  %95 = icmp ugt i32 %94, %bytes.1
  br i1 %95, label %bb1.i.i.i.i, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17he71b553e81d1e1aeE.exit.i"

panic7.i:                                         ; preds = %bb71.i
; call core::panicking::panic_bounds_check
  tail call void @_ZN4core9panicking18panic_bounds_check17h24c830ebecd41baaE({ [0 x i32], { [0 x i8]*, i32 }, [0 x i32], i32, [0 x i32], i32, [0 x i32] }* noalias readonly dereferenceable(16) bitcast ({ { [0 x i8]*, i32 }, i32, i32 }* @panic_bounds_check_loc.C to { [0 x i32], { [0 x i8]*, i32 }, [0 x i32], i32, [0 x i32], i32, [0 x i32] }*), i32 %47, i32 %bytes.1)
  unreachable

bb5:                                              ; preds = %bb29.i.i, %bb35.i.i, %bb67.i, %bb64.i, %bb62.i, %bb49.i, %bb57.i, %bb13.i, %bb43.i, %bb26.i, %bb37.i, %bb12.i, %bb20.i, %bb11.i, %bb9.i, %bb15.i
  %_0.0 = phi i32 [ %37, %bb67.i ], [ %37, %bb64.i ], [ %37, %bb62.i ], [ %37, %bb49.i ], [ %37, %bb57.i ], [ %37, %bb13.i ], [ %37, %bb43.i ], [ %37, %bb26.i ], [ %37, %bb37.i ], [ %37, %bb12.i ], [ %37, %bb20.i ], [ %37, %bb11.i ], [ %37, %bb9.i ], [ %bytes.1, %bb15.i ], [ %bytes.1, %bb35.i.i ], [ %bytes.1, %bb29.i.i ]
  ret i32 %_0.0
}

@hsivonen
Member Author

Code generated by the current nightly looks like the code generated by 1.25.0.

@hsivonen
Member Author

opt-level=3 does not fix this.

@michaelwoerister
Member

cc @rust-lang/wg-codegen

@kennytm kennytm added A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues. I-slow Issue: Problems and improvements with respect to performance of generated code. regression-from-stable-to-stable Performance or correctness regression from one stable version to another. WG-llvm Working group: LLVM backend code generation labels Apr 11, 2018
@hsivonen
Member Author

What instruction scheduling model does rustc use by default?

The change looks like it's a result of instruction scheduling trickery, and the LLVM 6.0 release notes say: "Added instruction scheduling information for Intel Sandy Bridge, Ivy Bridge, Haswell, Broadwell, and Skylake CPUs."

Notably, when running cargo bench on Haswell outside the Gecko context (but with opt-level=2), I was unable to reproduce the performance degradation on the x86_64 Linux target.

How can I choose an instruction scheduling model without also enabling the instruction set extensions of the named microarchitecture?

@nagisa
Member

nagisa commented Apr 12, 2018 via email

@hsivonen
Member Author

As far as I can tell, Gecko doesn't set target-cpu or target-feature using the -C flag.

LLVM seems to have microarchitecture-specific scheduling models from Sandy Bridge upwards, and between LLVM 4.0 and 6.0 Intel engineers landed major rewrites of the Sandy Bridge and Haswell scheduling models, citing information obtained from the architects of these microarchitectures.

It's unclear to me where the default x86 and x86_64 scheduling models reside in the LLVM source.

@hsivonen
Member Author

Note that I observed something similar on x86-64 (and Firefox has seen a regression on x86-64 too).

The code generated for x86_64 for the unrolled innermost loop looks closer to expected, though. Aligned case:

.LBB16_14:
	cmpq	%r15, %rdx
	ja	.LBB16_21
	movdqa	(%r11,%rdx), %xmm0
	pmovmskb	%xmm0, %ebx
	addq	$16, %rdx
	testl	%ebx, %ebx
	je	.LBB16_14
	addq	$-16, %rdx
	testl	%ebx, %ebx
	jne	.LBB16_12
	jmp	.LBB16_17

@hsivonen
Member Author

It seems that for the i686 targets, the base CPU is pentium4. Changing it to pentium-m does not help. However, changing it to x86-64, which is the base CPU for the x86_64 targets, results in instruction scheduling similar to what actually targeting 64-bit code produces. (Changing it to haswell without also changing target-feature results in different instruction selection.)
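For example, the override can be applied by adding -C target-cpu to the earlier build command, along these lines (an illustration assuming the same setup as before; only the target-cpu value changes between runs):

RUSTC_BOOTSTRAP=1 RUSTFLAGS='-C opt-level=2 -C target-cpu=x86-64 --emit asm' cargo build --target i686-unknown-linux-gnu --release --features simd-accel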

@hsivonen
Member Author

  1. There are two instances of movdqa and two instances of movdqu, suggesting that the first trip through the loop has been unrolled into a separate copy outside the loop proper.
  2. In the actual loop, ALU instructions have been moved around, including placing one between the SSE2 instructions.

I checked old nightlies. The change occurred exactly when rustc switched from LLVM 4.0 to 6.0.

Effect 1 (the unrolling) happens at the IR level regardless of target-cpu. Effect 2 seems to be tied to CPU-model-specific instruction scheduling, for Pentium 4 and Pentium M at least, so effect 2 can be avoided by picking a different target-cpu.

Does rustc expose controls for omitting certain LLVM IR transformation passes? How should I go about locating the cause of effect 1?

@hsivonen
Member Author

Here are the LLVM passes (64-bit target) from the last LLVM 4.0 build of rustc and the first LLVM 6.0 build of rustc.

I'm not familiar with LLVM internals, so the only thing that stands out to me is that the Natural Loop Information pass seems to have moved relative to other passes in some cases, and that in some cases there are additional Machine Natural Loop Construction passes.

@hsivonen
Member Author

Remedying effect 2 isn't much of a remedy according to Gecko benchmarks.

It seems implausible that the single unrolled trip through the loop could be slow enough to explain the original slow-down, so I assume the slow-down has to be attributable to the unrolling causing code movement within the inner loop--specifically, causing the index-based loop-exit check to move above the buffer-content check in the basic block that forms the inner loop.

@hsivonen
Member Author

The LLVM 4.0 to 6.0 switch happened between nightly-2018-02-10 and nightly-2018-02-11.

@nikomatsakis nikomatsakis added the P-medium Medium priority label Apr 26, 2018
@alexcrichton
Member

@hsivonen do you have the data necessary to reproduce the runtime regression here? I managed to extract ascii_valid_up_to into a single file so we can hopefully send it to upstream LLVM when we've reduced it enough, but with some cursory testing I'm unable to reproduce the time regression seen here.

@hsivonen
Member Author

hsivonen commented Apr 27, 2018

Thank you. I minimized the test case even further, tweaked the benchmark to match Firefox memory alignment and iterations, added scripts to run the code easily and managed to reproduce the regression in x86_64 mode on two different microarchitectures.

@hsivonen
Member Author

I filed an LLVM bug.

@eddyb
Member

eddyb commented Apr 27, 2018

@hsivonen You might want to attach some LLVM IR samples to the LLVM bug report - or even a C(++) source, if it's easy enough to do - to make it easier for LLVM developers to test potential fixes.

(In my experience, LLVM developers don't tend to try LLVM changes with custom Rust builds, but they can easily have clang within their LLVM build tree to quickly try various C(++) testcases.)

@alexcrichton
Member

I've commented with a pure C example to help the LLVM devs dig in.

@nikic
Contributor

nikic commented May 14, 2018

This is caused by the loop rotation pass. Using the IR at https://gist.github.com/nikic/d55cb75635a52a29790248b6bcd24be8, running opt -loop-rotate shows the issue. It's somewhat fragile, though; e.g. -simplifycfg -loop-rotate won't work. On LLVM trunk, -O2 also no longer seems to exhibit the problem.
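For anyone reproducing this, the invocation would look roughly like the following (the input filename is hypothetical; -S keeps the output as textual IR):

opt -S -loop-rotate input.ll -o rotated.ll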

@nikic
Contributor

nikic commented May 15, 2018

I haven't tested, but I suspect that backporting llvm-mirror/llvm@4c64dfe would fix this regression.

The problem is as follows:

  • Loop rotation does not happen if the (unique) loop latch is loop exiting (with some exceptions).
  • At the point where loop rotation occurs, the loop latch is an unconditional branch from an empty block. Without indirection through this dummy block, the loop latch would be loop exiting.
  • Prior to the commit referenced above, CFG simplification with canonical loop preservation was not able to remove this extra block. (A minimal IR sketch of this shape follows.)
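To make that shape concrete, here is a minimal, hypothetical IR sketch (not taken from the actual function) of a loop whose exiting comparison sits in the header while the latch is an empty block ending in an unconditional back edge; once the extra block is folded into %body, the latch itself becomes loop exiting and loop rotation leaves the loop alone:

define i32 @sketch(i32 %n) {
entry:
  br label %body

body:                                ; header; the loop-exiting branch lives here
  %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
  %i.next = add i32 %i, 16
  %done = icmp ugt i32 %i.next, %n
  br i1 %done, label %exit, label %latch

latch:                               ; empty dummy latch: just an unconditional back edge
  br label %body

exit:
  ret i32 %i
}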

@nikomatsakis
Contributor

@nikic nice =)

Maybe someone can take a stab at evaluating that?

@nox
Contributor

nox commented May 15, 2018

I can try that later tonight, but I feel like I'm going to come back and it will have been done already hah.

@nikic
Contributor

nikic commented May 15, 2018

Verified that cherry-picking the commit fixes the regression.

nox added a commit to nox/rust that referenced this issue May 18, 2018
kennytm added a commit to kennytm/rust that referenced this issue May 19, 2018
Update LLVM to 56c931901cfb85cd6f7ed44c7d7520a8de1edf97

This brings in rust-lang/llvm#115, which fixes rust-lang#49873.
@hsivonen
Member Author

Thank you! I verified that the performance of the test case improved with the latest nightly.

Is this kind of LLVM cherry-pick eligible to be nominated for backporting into Rust beta? How does one nominate beta backports?

@alexcrichton
Member

I've added the beta-nominated tag to #50827, which is how we process backports (the compiler team is also tagged so this comes up during triage).

bors pushed a commit that referenced this issue Jun 1, 2018
@nikomatsakis
Contributor

The backport seems to be causing issues on beta that don't occur on nightly -- the origins of this are not known; perhaps some other fixes landed in the meantime? (The failing test is myriad-closures, and it fails only on 32-bit Windows, afaik -- it may be a memory usage issue?)

@hsivonen how much do you care that this is backported? We're trying to decide how many resources to devote to tracking this down vs fixing other things.

@SimonSapin
Contributor

Firefox is not updating its compiler from 1.24 (https://bugzilla.mozilla.org/show_bug.cgi?id=1447116) in part because of this. (@glandium can say more about the other blocker.)

@eddyb
Member

eddyb commented Jun 7, 2018

@nikomatsakis myriad-closures is indeed a stress test, and memory usage changes make sense.

@glandium
Contributor

glandium commented Jun 7, 2018

Firefox is going to stay on 1.24 until it can jump to 1.28. So, for Firefox, it doesn't matter whether this makes it into beta.

@djc
Contributor

djc commented Jun 8, 2018

@glandium where's the discussion/rationale on that? I searched Bugzilla but couldn't find it.

@glandium
Contributor

glandium commented Jun 8, 2018

@djc Beta is 1.27, and 1.27 doesn't allow getting the allocation size on OOM, which is important to us. 1.28 does.
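For context, the capability referred to here is being able to observe the size of the failed allocation. In 1.28-era Rust that is exposed (behind a then-unstable nightly feature) through std::alloc's allocation-error hook, roughly as sketched below; whether Gecko hooks OOM exactly this way is an assumption, not something stated in this thread:

#![feature(alloc_error_hook)]

use std::alloc::{set_alloc_error_hook, Layout};

// The Layout passed to the hook carries the size of the allocation that
// failed, which is what an OOM crash annotation wants to record.
fn oom_hook(layout: Layout) {
    eprintln!("allocation of {} bytes failed", layout.size());
    std::process::abort();
}

fn main() {
    set_alloc_error_hook(oom_hook);
}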

@hsivonen
Member Author

hsivonen commented Jun 8, 2018

@nikomatsakis, given what @glandium said about jumping to 1.28 anyway, I no longer care about a backport.
