From 11b74ddf483633f672ca7038d5a1be69b04a01e3 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Tue, 31 Jan 2023 20:31:09 +0800
Subject: [PATCH] faster duplicate_overlapping

Improve the unsafe version of duplicate_overlapping. The compiler
generates unfavourable assembly for the simple byte-by-byte version.
We now copy 4 bytes per iteration instead of one. Without that, the
compiler unrolls/auto-vectorizes the copy with a lot of branches,
which is not what we want, as large overlapping copies are not that
common.
---
 .gitignore              |  1 +
 benches/crit_bench.rs   | 10 +++++-----
 src/block/decompress.rs | 12 +++++++-----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index c05bc5dc..c04809ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /target
+benchmarks/target
 Cargo.lock
 my-prof.profile
 Session.vim

diff --git a/benches/crit_bench.rs b/benches/crit_bench.rs
index 750dc0cb..2fe3a05f 100644
--- a/benches/crit_bench.rs
+++ b/benches/crit_bench.rs
@@ -17,12 +17,12 @@ const COMPRESSION10MB: &[u8] = include_bytes!("dickens.txt");
 const COMPRESSION95K_VERY_GOOD_LOGO: &[u8] = include_bytes!("../logo.jpg");
 
 const ALL: &[&[u8]] = &[
-    //COMPRESSION1K as &[u8],
-    //COMPRESSION34K as &[u8],
-    //COMPRESSION65K as &[u8],
-    //COMPRESSION66K as &[u8],
+    COMPRESSION1K as &[u8],
+    COMPRESSION34K as &[u8],
+    COMPRESSION65K as &[u8],
+    COMPRESSION66K as &[u8],
     COMPRESSION10MB as &[u8],
-    // COMPRESSION95K_VERY_GOOD_LOGO as &[u8],
+    COMPRESSION95K_VERY_GOOD_LOGO as &[u8],
 ];
 
 fn compress_lz4_fear(input: &[u8]) -> Vec<u8> {

diff --git a/src/block/decompress.rs b/src/block/decompress.rs
index 9fda67a8..dd0ced8a 100644
--- a/src/block/decompress.rs
+++ b/src/block/decompress.rs
@@ -57,12 +57,14 @@ unsafe fn duplicate_overlapping(
     // To prevent that we write a dummy zero to output, which will zero out output in such cases.
     // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772
     output_ptr.write(0u8);
-    // Note: this looks like a harmless loop but is unrolled/auto-vectorized by the compiler
-    for _ in 0..match_length {
-        let curr = start.read();
-        output_ptr.write(curr);
-        *output_ptr = output_ptr.add(1);
+    let dst_ptr_end = output_ptr.add(match_length);
+    while (*output_ptr as usize) < dst_ptr_end as usize {
+        // Note that we copy 4 bytes instead of one.
+        // Without that, the compiler will unroll/auto-vectorize the copy with a lot of branches.
+        // This is not what we want, as large overlapping copies are not that common.
+        core::ptr::copy(start, *output_ptr, 4);
         start = start.add(1);
+        *output_ptr = output_ptr.add(1);
     }
 }
 
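For illustration (not part of the patch): a minimal, self-contained sketch of the new copy strategy. The helper name expand_overlapping_match and the Vec-based slack handling are assumptions made for this demo; the real duplicate_overlapping works on raw pointers into an output buffer whose required slack the decoder already guarantees.

use core::ptr;

/// Hypothetical demo helper: expands an LZ77-style overlapping match with
/// the patch's strategy of copying 4 bytes per iteration while advancing
/// both pointers by only 1 byte.
fn expand_overlapping_match(out: &mut Vec<u8>, offset: usize, match_length: usize) {
    assert!(offset >= 1 && offset <= out.len());
    let old_len = out.len();
    // Zero-filled slack: every 4-byte copy reads and writes up to 3 bytes
    // past the byte actually being produced. The real decoder reserves
    // equivalent slack in its output buffer instead of resizing.
    out.resize(old_len + match_length + 3, 0);
    unsafe {
        let mut output_ptr: *mut u8 = out.as_mut_ptr().add(old_len);
        let mut start: *const u8 = output_ptr.sub(offset);
        let dst_ptr_end = output_ptr.add(match_length);
        while (output_ptr as usize) < dst_ptr_end as usize {
            // `ptr::copy` has memmove semantics, so the overlapping 4-byte
            // copy is well defined. Only the byte at `output_ptr` is final:
            // the 3 trailing bytes may be stale, but every position gets its
            // correct value from the iteration that starts at it, and stale
            // bytes past the match end land in the slack.
            ptr::copy(start, output_ptr, 4);
            start = start.add(1);
            output_ptr = output_ptr.add(1);
        }
    }
    // Drop the slack; only `match_length` bytes were really produced.
    out.truncate(old_len + match_length);
}

fn main() {
    let mut out = b"abc".to_vec();
    // offset 3 < match_length 7, so source and destination overlap.
    expand_overlapping_match(&mut out, 3, 7);
    assert_eq!(out, b"abcabcabca");
}

With a constant count of 4, core::ptr::copy typically lowers to one unconditional 4-byte load/store pair, so the loop body contains no branches besides the loop condition, which is the effect the commit is after. The price is 3 redundant bytes written per iteration, which is cheap because each position is rewritten with its final value by its own iteration.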