From 539a3ed8650b645727d6d10227803da3c0dba918 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 17:48:16 +0200
Subject: [PATCH 01/18] Preallocate factors vector

---
 src/lib.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 984b010..1c00f57 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,7 +44,8 @@ pub fn encode(
         return Err(Error::ComponentsOutOfRange);
     }
 
-    let mut factors: Vec<[f32; 3]> = Vec::new();
+    let mut factors: Vec<[f32; 3]> =
+        Vec::with_capacity(components_x as usize * components_y as usize);
 
     for y in 0..components_y {
         for x in 0..components_x {

From a2496c768d805efba1e438955da6e7322161ee64 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 17:54:37 +0200
Subject: [PATCH 02/18] Preallocate whole blurhash string

---
 src/base83.rs | 16 +++++++++-------
 src/lib.rs    | 24 +++++++++++++++---------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/base83.rs b/src/base83.rs
index eb1e756..069970f 100644
--- a/src/base83.rs
+++ b/src/base83.rs
@@ -9,15 +9,11 @@ static CHARACTERS: [u8; 83] = [
     b'|', b'}', b'~',
 ];
 
-pub fn encode(value: u32, length: u32) -> String {
-    let mut result = String::new();
-
+pub fn encode_into(value: u32, length: u32, s: &mut String) {
     for i in 1..=length {
         let digit: u32 = (value / u32::pow(83, length - i)) % 83;
-        result.push(CHARACTERS[digit as usize] as char);
+        s.push(CHARACTERS[digit as usize] as char);
     }
-
-    result
 }
 
 pub fn decode(str: &str) -> Result<u64, Error> {
@@ -40,7 +36,13 @@ pub fn decode(str: &str) -> Result<u64, Error> {
 
 #[cfg(test)]
 mod tests {
-    use super::{decode, encode};
+    use super::{decode, encode_into};
+
+    fn encode(value: u32, length: u32) -> String {
+        let mut s = String::new();
+        encode_into(value, length, &mut s);
+        s
+    }
 
     #[test]
     fn encode83() {
diff --git a/src/lib.rs b/src/lib.rs
index 1c00f57..a8cad2f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -57,10 +57,19 @@ pub fn encode(
     let dc = factors[0];
     let ac = &factors[1..];
 
-    let mut blurhash = String::new();
+    let mut blurhash = String::with_capacity(
+        // 1 byte for size flag
+        1
+        // 1 byte for maximum value
+        + 1
+        // 4 bytes for DC
+        + 4
+        // 2 bytes for each AC
+        + 2 * ac.len(),
+    );
 
     let size_flag = (components_x - 1) + (components_y - 1) * 9;
-    blurhash.push_str(&base83::encode(size_flag, 1));
+    base83::encode_into(size_flag, 1, &mut blurhash);
 
     let maximum_value: f32;
     if !ac.is_empty() {
@@ -77,19 +86,16 @@ pub fn encode(
         ) as u32;
 
         maximum_value = (quantised_maximum_value + 1) as f32 / 166.;
-        blurhash.push_str(&base83::encode(quantised_maximum_value, 1));
+        base83::encode_into(quantised_maximum_value, 1, &mut blurhash);
     } else {
         maximum_value = 1.;
-        blurhash.push_str(&base83::encode(0, 1));
+        base83::encode_into(0, 1, &mut blurhash);
     }
 
-    blurhash.push_str(&base83::encode(dc::encode(dc), 4));
+    base83::encode_into(dc::encode(dc), 4, &mut blurhash);
 
     for i in 0..components_y * components_x - 1 {
-        blurhash.push_str(&base83::encode(
-            ac::encode(ac[i as usize], maximum_value),
-            2,
-        ));
+        base83::encode_into(ac::encode(ac[i as usize], maximum_value), 2, &mut blurhash);
     }
 
     Ok(blurhash)

From 9f4697a1fb1f950aa084b47aac0d23ca38568e60 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 18:11:24 +0200
Subject: [PATCH 03/18] build: Move write_srgb in a function

---
 build.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build.rs b/build.rs
index e45cae3..4291665 100644
--- a/build.rs
+++ b/build.rs
@@ -18,11 +18,15 @@ fn generate_srgb_lookup() -> [f32; 256] {
     table
 }
 
-fn main() {
+fn write_srgb(f: &mut std::fs::File) {
     let table = generate_srgb_lookup();
+    writeln!(f, "static SRGB_LOOKUP: [f32; 256] = {:?};", table).unwrap();
+}
 
+fn main() {
     let out_dir = std::env::var("OUT_DIR").unwrap();
     let out_dir = std::path::PathBuf::from(out_dir);
+
     let mut f = std::fs::File::create(out_dir.join("srgb_lookup.rs")).unwrap();
-    writeln!(f, "static SRGB_LOOKUP: [f32; 256] = {:?};", table).unwrap();
+    write_srgb(&mut f);
 }

From fdcd7cd3dd6b03a38902c56e9447c3042cff163e Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 18:14:26 +0200
Subject: [PATCH 04/18] Write base83 characters list through build script

---
 build.rs      | 9 +++++++++
 src/base83.rs | 9 +--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/build.rs b/build.rs
index 4291665..3e75c80 100644
--- a/build.rs
+++ b/build.rs
@@ -23,10 +23,19 @@ fn write_srgb(f: &mut std::fs::File) {
     writeln!(f, "static SRGB_LOOKUP: [f32; 256] = {:?};", table).unwrap();
 }
 
+fn write_base83(f: &mut std::fs::File) {
+    const CHARACTERS: &[u8; 83] =
+        b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
+    writeln!(f, "const CHARACTERS: [u8; 83] = {:?};", CHARACTERS).unwrap();
+}
+
 fn main() {
     let out_dir = std::env::var("OUT_DIR").unwrap();
     let out_dir = std::path::PathBuf::from(out_dir);
 
     let mut f = std::fs::File::create(out_dir.join("srgb_lookup.rs")).unwrap();
     write_srgb(&mut f);
+
+    let mut f = std::fs::File::create(out_dir.join("base83_lookup.rs")).unwrap();
+    write_base83(&mut f);
 }
diff --git a/src/base83.rs b/src/base83.rs
index 069970f..7fbaf27 100644
--- a/src/base83.rs
+++ b/src/base83.rs
@@ -1,13 +1,6 @@
 use crate::Error;
 
-static CHARACTERS: [u8; 83] = [
-    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'A', b'B', b'C', b'D', b'E', b'F',
-    b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V',
-    b'W', b'X', b'Y', b'Z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l',
-    b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'#', b'$',
-    b'%', b'*', b'+', b',', b'-', b'.', b':', b';', b'=', b'?', b'@', b'[', b']', b'^', b'_', b'{',
-    b'|', b'}', b'~',
-];
+include!(concat!(env!("OUT_DIR"), "/base83_lookup.rs"));
 
 pub fn encode_into(value: u32, length: u32, s: &mut String) {
     for i in 1..=length {

From 71417afb1657d8c55891d87df88a98271cb89cbd Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 18:30:21 +0200
Subject: [PATCH 05/18] Precompute height/width inverses

---
 src/lib.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index a8cad2f..8b38615 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -173,14 +173,17 @@ pub fn decode_into(
 
     let bytes_per_row = width * 4;
 
+    let height_inv = 1. / height as f32;
+    let width_inv = 1. / width as f32;
+
     for y in 0..height {
         for x in 0..width {
             let mut pixel = [0.; 3];
 
             for j in 0..num_y {
                 for i in 0..num_x {
-                    let basis = f32::cos((PI * x as f32 * i as f32) / width as f32)
-                        * f32::cos((PI * y as f32 * j as f32) / height as f32);
+                    let basis = f32::cos((PI * x as f32 * i as f32) * width_inv)
+                        * f32::cos(PI * y as f32 * j as f32 * height_inv);
                     let color = &colors[i + j * num_x];
 
                     pixel[0] += color[0] * basis;

From 5e18d529c016e795e31e8d2560431a5a7ccfc297 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 18:56:58 +0200
Subject: [PATCH 06/18] Pull pi*x/width and pi*y/height out of hot loop

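---
Notes:
    As written, this commit applies the pi factor twice: `pi_over_height`
    and `pi_over_width` are already `PI / height` and `PI / width`, yet the
    per-row and per-column terms multiply by `PI` again
    (`PI * y as f32 * pi_over_height`, `PI * x as f32 * pi_over_width`).
    The cosine arguments therefore become pi^2 * y * j / height and
    pi^2 * x * i / width instead of pi * y * j / height and
    pi * x * i / width. The x term is corrected in patch 10 and the y term
    in patch 12, so the end of the series is unaffected, but `decode_into`
    output differs from the pre-series result at commits 06-11 (relevant
    when bisecting).
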
 src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8b38615..0e4f0cf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -173,17 +173,19 @@ pub fn decode_into(
 
     let bytes_per_row = width * 4;
 
-    let height_inv = 1. / height as f32;
-    let width_inv = 1. / width as f32;
+    let pi_over_height = PI / height as f32;
+    let pi_over_width = PI / width as f32;
 
     for y in 0..height {
+        let pi_y_over_height = PI * y as f32 * pi_over_height;
         for x in 0..width {
+            let pi_x_over_width = PI * x as f32 * pi_over_width;
             let mut pixel = [0.; 3];
 
             for j in 0..num_y {
                 for i in 0..num_x {
-                    let basis = f32::cos((PI * x as f32 * i as f32) * width_inv)
-                        * f32::cos(PI * y as f32 * j as f32 * height_inv);
+                    let basis = f32::cos(i as f32 * pi_x_over_width)
+                        * f32::cos(j as f32 * pi_y_over_height);
                     let color = &colors[i + j * num_x];
 
                     pixel[0] += color[0] * basis;

From 7ad8f2e1fd0759fc5ec95741cbed86fb41b73458 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 19:01:25 +0200
Subject: [PATCH 07/18] Generate base83 inverse character map

---
 build.rs      | 13 +++++++++++++
 src/base83.rs | 16 ++++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/build.rs b/build.rs
index 3e75c80..7601ba3 100644
--- a/build.rs
+++ b/build.rs
@@ -27,6 +27,19 @@ fn write_base83(f: &mut std::fs::File) {
     const CHARACTERS: &[u8; 83] =
         b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
     writeln!(f, "const CHARACTERS: [u8; 83] = {:?};", CHARACTERS).unwrap();
+
+    let max_plus_one = CHARACTERS.iter().max().unwrap() + 1;
+    let mut inv_map: [u8; 256] = [max_plus_one; 256];
+    for (i, &c) in CHARACTERS.iter().enumerate() {
+        inv_map[c as usize] = i as u8;
+    }
+    writeln!(
+        f,
+        "const CHARACTERS_INV: [u8; {max_plus_one}] = {:?};",
+        &inv_map[0..max_plus_one as usize]
+    )
+    .unwrap();
+    writeln!(f, "const CHARACTERS_INV_INVALID: u8 = {};", max_plus_one).unwrap();
 }
 
 fn main() {
diff --git a/src/base83.rs b/src/base83.rs
index 7fbaf27..1593679 100644
--- a/src/base83.rs
+++ b/src/base83.rs
@@ -17,10 +17,13 @@ pub fn decode(str: &str) -> Result<u64, Error> {
     let mut value = 0;
 
     for byte in str.as_bytes() {
-        let digit: usize = CHARACTERS
-            .iter()
-            .position(|r| r == byte)
-            .ok_or(Error::InvalidBase83(*byte))?;
+        if *byte as usize >= CHARACTERS_INV.len() {
+            return Err(Error::InvalidBase83(*byte));
+        }
+        let digit = CHARACTERS_INV[*byte as usize];
+        if digit == CHARACTERS_INV_INVALID {
+            return Err(Error::InvalidBase83(*byte));
+        }
         value = value * 83 + digit as u64;
     }
 
@@ -49,6 +52,11 @@ mod tests {
         assert_eq!(v, 6869);
     }
 
+    #[test]
+    fn decode83_too_large() {
+        assert!(decode("€").is_err());
+    }
+
     #[test]
     #[should_panic]
     fn decode83_too_long() {

From 08a04bc680ee3d3cb54b37a74257aa7727d14bbd Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 19:12:01 +0200
Subject: [PATCH 08/18] Remove a multiplication for sign_pow

---
 src/util.rs | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/util.rs b/src/util.rs
index 6e17108..78bb911 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -15,14 +15,11 @@ pub fn srgb_to_linear(value: u8) -> f32 {
     SRGB_LOOKUP[value as usize]
 }
 
-fn sign(n: f32) -> f32 {
-    if n < 0. {
-        -1.
+pub fn sign_pow(val: f32, exp: f32) -> f32 {
+    let t = f32::powf(val.abs(), exp);
+    if val < 0. {
+        -t
     } else {
-        1.
+        t
     }
 }
-
-pub fn sign_pow(val: f32, exp: f32) -> f32 {
-    sign(val) * f32::powf(val.abs(), exp)
-}

From f26590c945ac985c5a54fadd4f6f9e972e3bad62 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Sun, 7 Jul 2024 19:12:33 +0200
Subject: [PATCH 09/18] Reduce return size for linear_to_srgb

---
 src/dc.rs   | 6 +++---
 src/lib.rs  | 6 +++---
 src/util.rs | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/dc.rs b/src/dc.rs
index 8681b5c..6414dad 100644
--- a/src/dc.rs
+++ b/src/dc.rs
@@ -1,9 +1,9 @@
 use super::util::{linear_to_srgb, srgb_to_linear};
 
 pub fn encode(value: [f32; 3]) -> u32 {
-    let rounded_r = linear_to_srgb(value[0]);
-    let rounded_g = linear_to_srgb(value[1]);
-    let rounded_b = linear_to_srgb(value[2]);
+    let rounded_r = linear_to_srgb(value[0]) as u32;
+    let rounded_g = linear_to_srgb(value[1]) as u32;
+    let rounded_b = linear_to_srgb(value[2]) as u32;
     (rounded_r << 16) + (rounded_g << 8) + rounded_b
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index 0e4f0cf..79d563b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,9 +200,9 @@ pub fn decode_into(
 
             let pixels = &mut pixels[((4 * x + y * bytes_per_row) as usize)..][..4];
 
-            pixels[0] = int_r as u8;
-            pixels[1] = int_g as u8;
-            pixels[2] = int_b as u8;
+            pixels[0] = int_r;
+            pixels[1] = int_g;
+            pixels[2] = int_b;
             pixels[3] = 255u8;
         }
     }
diff --git a/src/util.rs b/src/util.rs
index 78bb911..0fc5647 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,12 +1,12 @@
 include!(concat!(env!("OUT_DIR"), "/srgb_lookup.rs"));
 
 /// linear 0.0-1.0 floating point to srgb 0-255 integer conversion.
-pub fn linear_to_srgb(value: f32) -> u32 {
+pub fn linear_to_srgb(value: f32) -> u8 {
     let v = f32::max(0., f32::min(1., value));
     if v <= 0.003_130_8 {
-        (v * 12.92 * 255. + 0.5).round() as u32
+        (v * 12.92 * 255. + 0.5).round() as u8
     } else {
-        ((1.055 * f32::powf(v, 1. / 2.4) - 0.055) * 255. + 0.5).round() as u32
+        ((1.055 * f32::powf(v, 1. / 2.4) - 0.055) * 255. + 0.5).round() as u8
     }
 }
 

From c44d6058267ff2461aa8739b50c15aeea8de022a Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 10:14:26 +0200
Subject: [PATCH 10/18] Precompute cosines outside of hot loop

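---
Notes:
    The new `pi_x_over_width` table is filled with `x as f32 * pi_over_width`,
    which drops the extra `PI *` factor of the line it replaces. The unchanged
    context line `let pi_y_over_height = PI * y as f32 * pi_over_height;`
    still multiplies by pi a second time (`pi_over_height` is already
    `PI / height`); that line is only removed in patch 12.
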
 src/lib.rs | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 79d563b..e1b7dc1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -176,16 +176,26 @@ pub fn decode_into(
     let pi_over_height = PI / height as f32;
     let pi_over_width = PI / width as f32;
 
+    let mut pi_x_over_width = vec![0.; width as usize];
+    for x in 0..width {
+        pi_x_over_width[x as usize] = x as f32 * pi_over_width as f32;
+    }
+
+    let mut cos_i_pi_x_over_width = vec![0.; num_x];
+
     for y in 0..height {
         let pi_y_over_height = PI * y as f32 * pi_over_height;
         for x in 0..width {
-            let pi_x_over_width = PI * x as f32 * pi_over_width;
             let mut pixel = [0.; 3];
 
+            for i in 0..num_x {
+                cos_i_pi_x_over_width[i] = f32::cos(pi_x_over_width[x as usize] * i as f32);
+            }
+
             for j in 0..num_y {
+                let cos_j_pi_y_over_height = f32::cos(j as f32 * pi_y_over_height);
                 for i in 0..num_x {
-                    let basis = f32::cos(i as f32 * pi_x_over_width)
-                        * f32::cos(j as f32 * pi_y_over_height);
+                    let basis = cos_i_pi_x_over_width[i] * cos_j_pi_y_over_height;
                     let color = &colors[i + j * num_x];
 
                     pixel[0] += color[0] * basis;

From 428d553b7d75532b1102abb2e560603f832472b1 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 10:40:25 +0200
Subject: [PATCH 11/18] Pull pi_x_width/pi_y_height out of hot loop

---
 src/lib.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index e1b7dc1..ce31b07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -118,10 +118,13 @@ fn multiply_basis_function(
 
     let bytes_per_row = width * 4;
 
+    let pi_cx_over_width = PI * component_x as f32 / width as f32;
+    let pi_cy_over_height = PI * component_y as f32 / height as f32;
+
     for y in 0..height {
         for x in 0..width {
-            let basis = f32::cos(PI * component_x as f32 * x as f32 / width as f32)
-                * f32::cos(PI * component_y as f32 * y as f32 / height as f32);
+            let basis =
+                f32::cos(pi_cx_over_width * x as f32) * f32::cos(pi_cy_over_height * y as f32);
             r += basis * srgb_to_linear(rgb[(4 * x + y * bytes_per_row) as usize]);
             g += basis * srgb_to_linear(rgb[(4 * x + 1 + y * bytes_per_row) as usize]);
             b += basis * srgb_to_linear(rgb[(4 * x + 2 + y * bytes_per_row) as usize]);

From bd7d2f8e4c7fad535c440374705f357b5b7ccb1c Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 10:40:36 +0200
Subject: [PATCH 12/18] Use precomputation tables for cosines in decode

---
 src/lib.rs | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index ce31b07..00837f4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -179,26 +179,32 @@ pub fn decode_into(
     let pi_over_height = PI / height as f32;
     let pi_over_width = PI / width as f32;
 
-    let mut pi_x_over_width = vec![0.; width as usize];
+    // Precompute the cosines
+    let mut cos_i_pi_x_over_width = vec![0.; width as usize * num_x];
+    let mut cos_j_pi_y_over_height = vec![0.; height as usize * num_y];
+
     for x in 0..width {
-        pi_x_over_width[x as usize] = x as f32 * pi_over_width as f32;
+        let pi_x_over_width = x as f32 * pi_over_width;
+        for i in 0..num_x {
+            cos_i_pi_x_over_width[x as usize * num_x + i] = f32::cos(pi_x_over_width * i as f32);
+        }
     }
 
-    let mut cos_i_pi_x_over_width = vec![0.; num_x];
+    for y in 0..height {
+        let pi_y_over_height = y as f32 * pi_over_height;
+        for j in 0..num_y {
+            cos_j_pi_y_over_height[y as usize * num_y + j] = f32::cos(j as f32 * pi_y_over_height);
+        }
+    }
 
     for y in 0..height {
-        let pi_y_over_height = PI * y as f32 * pi_over_height;
         for x in 0..width {
             let mut pixel = [0.; 3];
 
-            for i in 0..num_x {
-                cos_i_pi_x_over_width[i] = f32::cos(pi_x_over_width[x as usize] * i as f32);
-            }
-
             for j in 0..num_y {
-                let cos_j_pi_y_over_height = f32::cos(j as f32 * pi_y_over_height);
                 for i in 0..num_x {
-                    let basis = cos_i_pi_x_over_width[i] * cos_j_pi_y_over_height;
+                    let basis = cos_i_pi_x_over_width[x as usize * num_x + i]
+                        * cos_j_pi_y_over_height[y as usize * num_y + j];
                     let color = &colors[i + j * num_x];
 
                     pixel[0] += color[0] * basis;

From ab0520e1a35685c900aaef3e73816aa0c4825169 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 11:04:45 +0200
Subject: [PATCH 13/18] Use subslice for accessing cosine table

Co-authored-by: Thibaut Vandervelden <thvdveld@vub.be>
---
 src/lib.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 00837f4..07e106f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -201,11 +201,14 @@ pub fn decode_into(
         for x in 0..width {
             let mut pixel = [0.; 3];
 
+            let cos_i_pi_x_over_width = &cos_i_pi_x_over_width[x as usize * num_x..][..num_x];
+            let cos_j_pi_y_over_height = &cos_j_pi_y_over_height[y as usize * num_y..][..num_y];
             for j in 0..num_y {
+                let colors = &colors[j * num_x..][..num_x];
+
                 for i in 0..num_x {
-                    let basis = cos_i_pi_x_over_width[x as usize * num_x + i]
-                        * cos_j_pi_y_over_height[y as usize * num_y + j];
-                    let color = &colors[i + j * num_x];
+                    let basis = cos_i_pi_x_over_width[i] * cos_j_pi_y_over_height[j];
+                    let color = &colors[i];
 
                     pixel[0] += color[0] * basis;
                     pixel[1] += color[1] * basis;

From 8efa5cfe32cdec5efa4a40fe312da6c8dd2ea152 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 11:12:52 +0200
Subject: [PATCH 14/18] Use f32::copysign instead of manual branched assignment

---
 src/util.rs | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/util.rs b/src/util.rs
index 0fc5647..53fd5c7 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -16,10 +16,5 @@ pub fn srgb_to_linear(value: u8) -> f32 {
 }
 
 pub fn sign_pow(val: f32, exp: f32) -> f32 {
-    let t = f32::powf(val.abs(), exp);
-    if val < 0. {
-        -t
-    } else {
-        t
-    }
+    f32::copysign(f32::powf(val.abs(), exp), val)
 }

From f49f21d249fb5e12965671cf76b5beea6df77c4d Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 11:18:42 +0200
Subject: [PATCH 15/18] Optimization in linear_to_srgb

---
 src/util.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/util.rs b/src/util.rs
index 53fd5c7..b79e13c 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -6,7 +6,10 @@ pub fn linear_to_srgb(value: f32) -> u8 {
     if v <= 0.003_130_8 {
         (v * 12.92 * 255. + 0.5).round() as u8
     } else {
-        ((1.055 * f32::powf(v, 1. / 2.4) - 0.055) * 255. + 0.5).round() as u8
+        // The original C implementation uses this formula:
+        // ((1.055 * f32::powf(v, 1. / 2.4) - 0.055) * 255. + 0.5).round() as u8
+        // But we can distribute the latter multiplication, to reduce the number of operations:
+        ((1.055 * 255.) * f32::powf(v, 1. / 2.4) - (0.055 * 255. - 0.5)).round() as u8
     }
 }
 

From 1cca59ddb3cd8836e81b89feffcca0c87f216b5f Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 11:29:29 +0200
Subject: [PATCH 16/18] Faster computation of maximum

---
 src/lib.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 07e106f..cf411c9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -73,12 +73,12 @@ pub fn encode(
 
     let maximum_value: f32;
     if !ac.is_empty() {
-        let mut actualmaximum_value = 0.0;
-        for i in 0..components_y * components_x - 1 {
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][0]), actualmaximum_value);
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][1]), actualmaximum_value);
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][2]), actualmaximum_value);
-        }
+        let actualmaximum_value = ac
+            .iter()
+            .flatten()
+            .map(|x| f32::abs(*x))
+            .reduce(f32::max)
+            .unwrap_or(0.0);
 
         let quantised_maximum_value = f32::max(
             0.,

From a89f1316e9b89e4941ebe0c5c9e2560f50ee3e8c Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 11:34:54 +0200
Subject: [PATCH 17/18] Encode: precompute cosines

---
 src/lib.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index cf411c9..f7a9efb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -121,10 +121,19 @@ fn multiply_basis_function(
     let pi_cx_over_width = PI * component_x as f32 / width as f32;
     let pi_cy_over_height = PI * component_y as f32 / height as f32;
 
+    let mut cos_pi_cx_over_width = vec![0.; width as usize];
+    for x in 0..width {
+        cos_pi_cx_over_width[x as usize] = f32::cos(pi_cx_over_width * x as f32);
+    }
+
+    let mut cos_pi_cy_over_height = vec![0.; height as usize];
+    for y in 0..height {
+        cos_pi_cy_over_height[y as usize] = f32::cos(pi_cy_over_height * y as f32);
+    }
+
     for y in 0..height {
         for x in 0..width {
-            let basis =
-                f32::cos(pi_cx_over_width * x as f32) * f32::cos(pi_cy_over_height * y as f32);
+            let basis = cos_pi_cx_over_width[x as usize] * cos_pi_cy_over_height[y as usize];
             r += basis * srgb_to_linear(rgb[(4 * x + y * bytes_per_row) as usize]);
             g += basis * srgb_to_linear(rgb[(4 * x + 1 + y * bytes_per_row) as usize]);
             b += basis * srgb_to_linear(rgb[(4 * x + 2 + y * bytes_per_row) as usize]);

From 73fa250bff1f6b753cc33b2baff84ce7f46e3aa8 Mon Sep 17 00:00:00 2001
From: Ruben De Smet <ruben.de.smet@rubdos.be>
Date: Mon, 8 Jul 2024 14:10:13 +0200
Subject: [PATCH 18/18] asserts as hints for the optimizer, use zip as iterator

---
 src/lib.rs | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index f7a9efb..5975b0a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -183,7 +183,9 @@ pub fn decode_into(
         }
     }
 
-    let bytes_per_row = width * 4;
+    let colors: Vec<_> = colors.chunks(num_x).collect();
+
+    let bytes_per_row = width as usize * 4;
 
     let pi_over_height = PI / height as f32;
     let pi_over_width = PI / width as f32;
@@ -206,18 +208,31 @@ pub fn decode_into(
         }
     }
 
-    for y in 0..height {
-        for x in 0..width {
+    // Hint to the optimizer that the length of the slices is correct
+    assert!(height as usize * num_y == cos_j_pi_y_over_height.len());
+    assert!(width as usize * num_x == cos_i_pi_x_over_width.len());
+
+    for y in 0..height as usize {
+        let pixels = &mut pixels[y * bytes_per_row..][..bytes_per_row];
+
+        // More optimizer hints.
+        assert!(y * num_y + num_y <= cos_j_pi_y_over_height.len());
+
+        for x in 0..width as usize {
             let mut pixel = [0.; 3];
 
-            let cos_i_pi_x_over_width = &cos_i_pi_x_over_width[x as usize * num_x..][..num_x];
-            let cos_j_pi_y_over_height = &cos_j_pi_y_over_height[y as usize * num_y..][..num_y];
-            for j in 0..num_y {
-                let colors = &colors[j * num_x..][..num_x];
+            let cos_j_pi_y_over_height = &cos_j_pi_y_over_height[y * num_y..][..num_y];
+            let cos_i_pi_x_over_width = &cos_i_pi_x_over_width[x * num_x..][..num_x];
+
+            assert_eq!(cos_j_pi_y_over_height.len(), colors.len());
+            assert_eq!(cos_j_pi_y_over_height.len(), num_y);
+
+            for (cos_j, colors) in cos_j_pi_y_over_height.iter().zip(colors.iter()) {
+                assert_eq!(cos_i_pi_x_over_width.len(), colors.len());
+                assert_eq!(cos_i_pi_x_over_width.len(), num_x);
 
-                for i in 0..num_x {
-                    let basis = cos_i_pi_x_over_width[i] * cos_j_pi_y_over_height[j];
-                    let color = &colors[i];
+                for (cos_i, color) in cos_i_pi_x_over_width.iter().zip(colors.iter()) {
+                    let basis = cos_i * cos_j;
 
                     pixel[0] += color[0] * basis;
                     pixel[1] += color[1] * basis;
@@ -229,7 +244,7 @@ pub fn decode_into(
             let int_g = linear_to_srgb(pixel[1]);
             let int_b = linear_to_srgb(pixel[2]);
 
-            let pixels = &mut pixels[((4 * x + y * bytes_per_row) as usize)..][..4];
+            let pixels = &mut pixels[4 * x as usize..][..4];
 
             pixels[0] = int_r;
             pixels[1] = int_g;