-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bail out early from truncate
if the new length is equal to the old
#74172
Conversation
`truncate` should do nothing for collections if the new length is equal to the current length, but this was not reflected in the code. Instead, control flow continued until an empty slice was passed to `drop_in_place`.
(rust_highfive has picked a reviewer for you, use r? to override) |
Seems like this results in an additional branch for Maybe we just want to close this? Or I could implement |
I don't think this makes the code clearer, and given the mixed (probably minor) performance impact, I'm especially reluctant to merge this. I also don't think duplicating code for |
The new version of `truncate` compiles up a branch when called with a constant `0` and `T: !Drop`.
I've replaced Old: # %bb.0:
push rbp
push r15
push r14
push r13
push r12
push rbx
push rax
mov r12, rsi
mov r15, qword ptr [rsi + 16]
mov r13, qword ptr [rdi + 16]
cmp r13, r15
jb .LBB2_6 # EXTRA BRANCH HERE
# %bb.1:
mov rbx, qword ptr [rdi]
mov qword ptr [rdi + 16], r15
jne .LBB2_3
# %bb.2:
mov r13, r15
jmp .LBB2_8
.LBB2_3:
mov qword ptr [rsp], rdi # 8-byte Spill
lea rbp, [8*r15]
shl r13, 3
mov r14, qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov rdi, qword ptr [rbx + rbp]
mov esi, 4
mov edx, 4
call r14
add rbp, 8
cmp r13, rbp
jne .LBB2_4
# %bb.5:
mov rdi, qword ptr [rsp] # 8-byte Reload
mov r13, qword ptr [rdi + 16]
.LBB2_6:
cmp r15, r13
jb .LBB2_17
# %bb.7:
mov rbx, qword ptr [rdi]
.LBB2_8:
mov rcx, qword ptr [r12]
lea rsi, [rcx + 8*r13]
sub r15, r13
test r13, r13
je .LBB2_16
# %bb.9:
lea rdx, [r13 - 1]
mov r8d, r13d
and r8d, 3
cmp rdx, 3
jae .LBB2_11
# %bb.10:
xor edx, edx
jmp .LBB2_13
.LBB2_11:
sub r13, r8
xor edx, edx
.LBB2_12: # =>This Inner Loop Header: Depth=1
mov rax, qword ptr [rbx + 8*rdx]
mov rbp, qword ptr [rcx + 8*rdx]
mov ebp, dword ptr [rbp]
mov dword ptr [rax], ebp
mov rax, qword ptr [rbx + 8*rdx + 8]
mov rbp, qword ptr [rcx + 8*rdx + 8]
mov ebp, dword ptr [rbp]
mov dword ptr [rax], ebp
mov rax, qword ptr [rbx + 8*rdx + 16]
mov rbp, qword ptr [rcx + 8*rdx + 16]
mov ebp, dword ptr [rbp]
mov dword ptr [rax], ebp
mov rax, qword ptr [rbx + 8*rdx + 24]
mov rbp, qword ptr [rcx + 8*rdx + 24]
add rdx, 4
mov ebp, dword ptr [rbp]
mov dword ptr [rax], ebp
cmp r13, rdx
jne .LBB2_12
.LBB2_13:
test r8, r8
je .LBB2_16
# %bb.14:
lea rcx, [rcx + 8*rdx]
lea rdx, [rbx + 8*rdx]
xor eax, eax
.LBB2_15: # =>This Inner Loop Header: Depth=1
mov rbp, qword ptr [rdx + 8*rax]
mov rbx, qword ptr [rcx + 8*rax]
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
add rax, 1
cmp r8, rax
jne .LBB2_15
.LBB2_16:
mov rdx, r15
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
jmp alloc::vec::Vec<T>::extend_from_slice # TAILCALL
.LBB2_17:
lea rdx, [rip + .L__unnamed_1]
mov rdi, r13
mov rsi, r15
call qword ptr [rip + core::slice::slice_index_len_fail@GOTPCREL]
ud2
# -- End function New: # %bb.0:
push rbp
push r15
push r14
push r13
push r12
push rbx
push rax
mov r12, rsi
mov r15, qword ptr [rsi + 16]
mov r13, qword ptr [rdi + 16]
cmp r13, r15
jbe .LBB3_4
# %bb.1:
mov rbx, qword ptr [rdi]
mov qword ptr [rsp], rdi # 8-byte Spill
mov qword ptr [rdi + 16], r15
lea rbp, [8*r15]
shl r13, 3
mov r14, qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB3_2: # =>This Inner Loop Header: Depth=1
mov rdi, qword ptr [rbx + rbp]
mov esi, 4
mov edx, 4
call r14
add rbp, 8
cmp r13, rbp
jne .LBB3_2
# %bb.3:
mov rdi, qword ptr [rsp] # 8-byte Reload
mov r13, qword ptr [rdi + 16]
.LBB3_4:
mov rdx, r15
sub rdx, r13
jb .LBB3_14
# %bb.5:
mov r9, qword ptr [r12]
lea rsi, [r9 + 8*r13]
test r13, r13
je .LBB3_13
# %bb.6:
mov rcx, qword ptr [rdi]
lea rax, [r13 - 1]
mov r8d, r13d
and r8d, 3
cmp rax, 3
jae .LBB3_8
# %bb.7:
xor eax, eax
jmp .LBB3_10
.LBB3_8:
sub r13, r8
xor eax, eax
.LBB3_9: # =>This Inner Loop Header: Depth=1
mov rbp, qword ptr [rcx + 8*rax]
mov rbx, qword ptr [r9 + 8*rax]
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
mov rbp, qword ptr [rcx + 8*rax + 8]
mov rbx, qword ptr [r9 + 8*rax + 8]
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
mov rbp, qword ptr [rcx + 8*rax + 16]
mov rbx, qword ptr [r9 + 8*rax + 16]
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
mov rbp, qword ptr [rcx + 8*rax + 24]
mov rbx, qword ptr [r9 + 8*rax + 24]
add rax, 4
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
cmp r13, rax
jne .LBB3_9
.LBB3_10:
test r8, r8
je .LBB3_13
# %bb.11:
lea r9, [r9 + 8*rax]
lea rax, [rcx + 8*rax]
xor ecx, ecx
.LBB3_12: # =>This Inner Loop Header: Depth=1
mov rbp, qword ptr [rax + 8*rcx]
mov rbx, qword ptr [r9 + 8*rcx]
mov ebx, dword ptr [rbx]
mov dword ptr [rbp], ebx
add rcx, 1
cmp r8, rcx
jne .LBB3_12
.LBB3_13:
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
jmp alloc::vec::Vec<T>::extend_from_slice # TAILCALL
.LBB3_14:
lea rdx, [rip + .L__unnamed_1]
mov rdi, r13
mov rsi, r15
call qword ptr [rip + core::slice::slice_index_len_fail@GOTPCREL]
ud2
# -- End function |
As I said before, I'd like to see some evidence that this change is actually beneficial, i.e., makes a non-negligible impact for a somewhat realistic application (or kernel), before we go and introduce more unsafe code (and potentially, subtle behavioral differences e.g. around panic-during-elem-drop). The above assembly excerpt isn't really convincing me the change is worthwhile: is a single extra branch (which happens once, not per element, IIUC) really going to matter when it's accompanied by dropping a bunch of |
I also noticed this in pub fn truncate(&mut self, len: usize) {
// This is safe because:
//
// * the slice passed to `drop_in_place` is valid; the `len > self.len` case avoids
// creating an invalid slice, and
// * the `len` of the vector is shrunk before calling `drop_in_place`, such that no value
// will be dropped twice in case `drop_in_place` were to panic once (if it panics twice,
// the program aborts).
unsafe {
if len > self.len() {
return;
}
let remaining_len = self.len() - len;
let s = ptr::slice_from_raw_parts_mut(self.as_mut_ptr().add(len), remaining_len);
self.buf.set_len(len);
ptr::drop_in_place(s);
}
} I don't know why one needs to continue running the extra code if the length is the same. I saw this while tracing from pub fn resize_with<F>(&mut self, new_len: usize, f: F)
where
F: FnMut() -> T,
{
let len = self.len();
if new_len > len {
self.extend_with(new_len - len, ExtendFunc(f));
} else {
self.truncate(new_len);
}
} In which |
if len > self.len { | ||
if len >= self.len { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ecstatic-morse Can you please check on this? I think this also impacts resize
where it runs truncate even though the length is the same. CC @lzutao
pub fn resize(&mut self, new_len: usize, value: T) {
let len = self.len();
if new_len > len {
self.extend_with(new_len - len, ExtendElement(value))
} else {
self.truncate(new_len); // truncate run the rest of the code even though the length is same
}
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pickfire What would you like me to do exactly? This PR was closed because the reviewer wouldn't sign off on it. Talk to them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ecstatic-morse The reviewer is leaving the team already. See the message on zulip.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's too bad. Still, I don't see what you want from me. While I think this PR was fine as-is and was held to an unusually high standard during the review process, stepping away from the project does not invalidate one's opinion. If you can find someone with r+ rights who feels strongly that some version of these changes should be merged, then I guess we can reopen this, but I'm not going to try to cherry pick a reviewer.
Currently it just calls `truncate(0)`. `truncate()` is (a) not marked as `#[inline]`, and (b) more general than needed for `clear()`. This commit changes `clear()` to do the work itself. This modest change was first proposed in rust-lang#74172, where the reviewer rejected it because there was insufficient evidence that `Vec::clear()`'s performance mattered enough to justify the change. Recent changes within rustc have made `Vec::clear()` hot within `macro_parser.rs`, so the change is now clearly worthwhile. Note that this will also benefit `String::clear()`, because it just calls `Vec::clear()`.
Currently it just calls `truncate(0)`. `truncate()` is (a) not marked as `#[inline]`, and (b) more general than needed for `clear()`. This commit changes `clear()` to do the work itself. This modest change was first proposed in rust-lang#74172, where the reviewer rejected it because there was insufficient evidence that `Vec::clear()`'s performance mattered enough to justify the change. Recent changes within rustc have made `Vec::clear()` hot within `macro_parser.rs`, so the change is now clearly worthwhile. Although it doesn't show wins on CI perf runs, this seems to be because they use PGO. But not all platforms currently use PGO. Also, local builds don't use PGO, and `truncate` sometimes shows up in an over-represented fashion in local profiles. So local profiling will be made easier by this change. Note that this will also benefit `String::clear()`, because it just calls `Vec::clear()`. Finally, the commit removes the `vec-clear.rs` codegen test. It was added in rust-lang#52908. From before then until now, `Vec::clear()` just called `Vec::truncate()` with a zero length. The body of Vec::truncate() has changed a lot since then. Now that `Vec::clear()` is doing actual work itself, and not just calling `Vec::truncate()`, it's not surprising that its generated code includes a load and an icmp. I think it's reasonable to remove this test.
…-ou-se Speed up Vec::clear(). Currently it just calls `truncate(0)`. `truncate()` is (a) not marked as `#[inline]`, and (b) more general than needed for `clear()`. This commit changes `clear()` to do the work itself. This modest change was first proposed in rust-lang#74172, where the reviewer rejected it because there was insufficient evidence that `Vec::clear()`'s performance mattered enough to justify the change. Recent changes within rustc have made `Vec::clear()` hot within `macro_parser.rs`, so the change is now clearly worthwhile. Although it doesn't show wins on CI perf runs, this seems to be because they use PGO. But not all platforms currently use PGO. Also, local builds don't use PGO, and `truncate` sometimes shows up in an over-represented fashion in local profiles. So local profiling will be made easier by this change. Note that this will also benefit `String::clear()`, because it just calls `Vec::clear()`. Finally, the commit removes the `vec-clear.rs` codegen test. It was added in rust-lang#52908. From before then until now, `Vec::clear()` just called `Vec::truncate()` with a zero length. The body of Vec::truncate() has changed a lot since then. Now that `Vec::clear()` is doing actual work itself, and not just calling `Vec::truncate()`, it's not surprising that its generated code includes a load and an icmp. I think it's reasonable to remove this test. r? `@m-ou-se`
truncate
should do nothing for collections if the new length is equal to the current length, but this is not explicitly stated in the code. Instead, control flow continues in this case until an empty slice is passed todrop_in_place
. While this is equivalentand likely optimizes to the same code, it is not as clear and edit: results in slightly worse codegen for vectors containing types withDrop
glue.