From 11b74ddf483633f672ca7038d5a1be69b04a01e3 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Tue, 31 Jan 2023 20:31:09 +0800
Subject: [PATCH] faster duplicate_overlapping

Improve the unsafe version of duplicate_overlapping. The compiler
generates unfavourable assembly for the simple byte-by-byte version.
We now copy 4 bytes per iteration instead of one. Without that, the
compiler unrolls/auto-vectorizes the copy with a lot of branches,
which is not what we want, as large overlapping copies are not that
common.
---
 .gitignore              |  1 +
 benches/crit_bench.rs   | 10 +++++-----
 src/block/decompress.rs | 12 +++++++-----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index c05bc5dc..c04809ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /target
+benchmarks/target
 Cargo.lock
 my-prof.profile
 Session.vim

diff --git a/benches/crit_bench.rs b/benches/crit_bench.rs
index 750dc0cb..2fe3a05f 100644
--- a/benches/crit_bench.rs
+++ b/benches/crit_bench.rs
@@ -17,12 +17,12 @@ const COMPRESSION10MB: &[u8] = include_bytes!("dickens.txt");
 const COMPRESSION95K_VERY_GOOD_LOGO: &[u8] = include_bytes!("../logo.jpg");
 
 const ALL: &[&[u8]] = &[
-    //COMPRESSION1K as &[u8],
-    //COMPRESSION34K as &[u8],
-    //COMPRESSION65K as &[u8],
-    //COMPRESSION66K as &[u8],
+    COMPRESSION1K as &[u8],
+    COMPRESSION34K as &[u8],
+    COMPRESSION65K as &[u8],
+    COMPRESSION66K as &[u8],
     COMPRESSION10MB as &[u8],
-    // COMPRESSION95K_VERY_GOOD_LOGO as &[u8],
+    COMPRESSION95K_VERY_GOOD_LOGO as &[u8],
 ];
 
 fn compress_lz4_fear(input: &[u8]) -> Vec<u8> {

diff --git a/src/block/decompress.rs b/src/block/decompress.rs
index 9fda67a8..dd0ced8a 100644
--- a/src/block/decompress.rs
+++ b/src/block/decompress.rs
@@ -57,12 +57,14 @@ unsafe fn duplicate_overlapping(
     // To prevent that we write a dummy zero to output, which will zero out output in such cases.
     // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772
     output_ptr.write(0u8);
-    // Note: this looks like a harmless loop but is unrolled/auto-vectorized by the compiler
-    for _ in 0..match_length {
-        let curr = start.read();
-        output_ptr.write(curr);
-        *output_ptr = output_ptr.add(1);
+    let dst_ptr_end = output_ptr.add(match_length);
+    while (*output_ptr as usize) < dst_ptr_end as usize {
+        // Note that we copy 4 bytes instead of one.
+        // Without that, the compiler will unroll/auto-vectorize the copy with a lot of branches.
+        // This is not what we want, as large overlapping copies are not that common.
+        core::ptr::copy(start, *output_ptr, 4);
         start = start.add(1);
+        *output_ptr = output_ptr.add(1);
     }
 }
 
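For illustration (not part of the patch): a minimal, self-contained sketch of the new copy strategy. The helper name expand_overlapping_match and the Vec-based slack handling are assumptions made for this demo; the real duplicate_overlapping works on raw pointers into an output buffer whose required slack the decoder already guarantees.

use core::ptr;

/// Hypothetical demo helper: expands an LZ77-style overlapping match with
/// the patch's strategy of copying 4 bytes per iteration while advancing
/// both pointers by only 1 byte.
fn expand_overlapping_match(out: &mut Vec<u8>, offset: usize, match_length: usize) {
    assert!(offset >= 1 && offset <= out.len());
    let old_len = out.len();
    // Zero-filled slack: every 4-byte copy reads and writes up to 3 bytes
    // past the byte actually being produced. The real decoder reserves
    // equivalent slack in its output buffer instead of resizing.
    out.resize(old_len + match_length + 3, 0);
    unsafe {
        let mut output_ptr: *mut u8 = out.as_mut_ptr().add(old_len);
        let mut start: *const u8 = output_ptr.sub(offset);
        let dst_ptr_end = output_ptr.add(match_length);
        while (output_ptr as usize) < dst_ptr_end as usize {
            // `ptr::copy` has memmove semantics, so the overlapping 4-byte
            // copy is well defined. Only the byte at `output_ptr` is final:
            // the 3 trailing bytes may be stale, but every position gets its
            // correct value from the iteration that starts at it, and stale
            // bytes past the match end land in the slack.
            ptr::copy(start, output_ptr, 4);
            start = start.add(1);
            output_ptr = output_ptr.add(1);
        }
    }
    // Drop the slack; only `match_length` bytes were really produced.
    out.truncate(old_len + match_length);
}

fn main() {
    let mut out = b"abc".to_vec();
    // offset 3 < match_length 7, so source and destination overlap.
    expand_overlapping_match(&mut out, 3, 7);
    assert_eq!(out, b"abcabcabca");
}

With a constant count of 4, core::ptr::copy typically lowers to one unconditional 4-byte load/store pair, so the loop body contains no branches besides the loop condition, which is the effect the commit is after. The price is 3 redundant bytes written per iteration, which is cheap because each position is rewritten with its final value by its own iteration.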