whisperfish · rubdos · Jul 8, 2024 · Jul 7, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/build.rs b/build.rs
@@ -18,11 +18,37 @@ fn generate_srgb_lookup() -> [f32; 256] {
     table
 }
 
-fn main() {
+fn write_srgb(f: &mut std::fs::File) { // writes the generated sRGB table into `f` as Rust source
     let table = generate_srgb_lookup();
+    writeln!(f, "static SRGB_LOOKUP: [f32; 256] = {:?};", table).unwrap(); // panics on I/O error
+}
 
+/// Writes the base83 tables as Rust source into `f`: the alphabet `CHARACTERS`, its
+/// byte-indexed inverse `CHARACTERS_INV`, and the "no digit" marker `CHARACTERS_INV_INVALID`.
+fn write_base83(f: &mut std::fs::File) {
+    const CHARACTERS: &[u8; 83] =
+        b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
+
+    // One slot per byte value up to the largest alphabet byte. Slots that belong to no
+    // alphabet character keep `max_plus_one`, which no digit (0..=82) can collide with.
+    let max_plus_one = CHARACTERS.iter().max().unwrap() + 1;
+    let mut inverse = vec![max_plus_one; usize::from(max_plus_one)];
+    for (digit, &byte) in CHARACTERS.iter().enumerate() {
+        inverse[usize::from(byte)] = digit as u8;
+    }
+
+    writeln!(f, "const CHARACTERS: [u8; 83] = {:?};", CHARACTERS).unwrap();
+    writeln!(f, "const CHARACTERS_INV: [u8; {max_plus_one}] = {:?};", inverse).unwrap();
+    writeln!(f, "const CHARACTERS_INV_INVALID: u8 = {};", max_plus_one).unwrap();
+}
+
+fn main() { // build script: generates the lookup tables that src/ pulls in via `include!`
     let out_dir = std::env::var("OUT_DIR").unwrap();
     let out_dir = std::path::PathBuf::from(out_dir);
+    println!("cargo:rerun-if-changed=build.rs"); // tables depend on nothing but this script
     let mut f = std::fs::File::create(out_dir.join("srgb_lookup.rs")).unwrap();
-    writeln!(f, "static SRGB_LOOKUP: [f32; 256] = {:?};", table).unwrap();
+    write_srgb(&mut f);
+
+    let mut f = std::fs::File::create(out_dir.join("base83_lookup.rs")).unwrap();
+    write_base83(&mut f);
 }
diff --git a/src/base83.rs b/src/base83.rs
@@ -1,23 +1,12 @@
 use crate::Error;
 
-static CHARACTERS: [u8; 83] = [
-    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'A', b'B', b'C', b'D', b'E', b'F',
-    b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V',
-    b'W', b'X', b'Y', b'Z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l',
-    b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'#', b'$',
-    b'%', b'*', b'+', b',', b'-', b'.', b':', b';', b'=', b'?', b'@', b'[', b']', b'^', b'_', b'{',
-    b'|', b'}', b'~',
-];
-
-pub fn encode(value: u32, length: u32) -> String {
-    let mut result = String::new();
+include!(concat!(env!("OUT_DIR"), "/base83_lookup.rs"));
 
+pub fn encode_into(value: u32, length: u32, s: &mut String) { // appends `length` digits to `s`
     for i in 1..=length {
         let digit: u32 = (value / u32::pow(83, length - i)) % 83;
-        result.push(CHARACTERS[digit as usize] as char);
+        s.push(CHARACTERS[digit as usize] as char); // `% 83` keeps the index inside the table
     }
-
-    result
 }
 
 pub fn decode(str: &str) -> Result<u64, Error> {
@@ -28,10 +17,13 @@ pub fn decode(str: &str) -> Result<u64, Error> {
     let mut value = 0;
 
     for byte in str.as_bytes() {
-        let digit: usize = CHARACTERS
-            .iter()
-            .position(|r| r == byte)
-            .ok_or(Error::InvalidBase83(*byte))?;
+        if *byte as usize >= CHARACTERS_INV.len() {
+            return Err(Error::InvalidBase83(*byte));
+        }
+        let digit = CHARACTERS_INV[*byte as usize];
+        if digit == CHARACTERS_INV_INVALID {
+            return Err(Error::InvalidBase83(*byte));
+        }
         value = value * 83 + digit as u64;
     }
 
@@ -40,7 +32,13 @@ pub fn decode(str: &str) -> Result<u64, Error> {
 
 #[cfg(test)]
 mod tests {
-    use super::{decode, encode};
+    use super::{decode, encode_into};
+
+    fn encode(value: u32, length: u32) -> String { // test-only wrapper around `encode_into`
+        let mut s = String::new();
+        encode_into(value, length, &mut s); // appends the digits to the empty buffer
+        s
+    }
 
     #[test]
     fn encode83() {
@@ -54,6 +52,11 @@ mod tests {
         assert_eq!(v, 6869);
     }
 
+    #[test]
+    fn decode83_too_large() {
+        assert!(decode("€").is_err()); // non-ASCII: all three UTF-8 bytes of '€' are >= 0x80
+    }
+
     #[test]
     #[should_panic]
     fn decode83_too_long() {

diff --git a/src/dc.rs b/src/dc.rs
@@ -1,9 +1,9 @@
 use super::util::{linear_to_srgb, srgb_to_linear};
 
 pub fn encode(value: [f32; 3]) -> u32 {
-    let rounded_r = linear_to_srgb(value[0]);
-    let rounded_g = linear_to_srgb(value[1]);
-    let rounded_b = linear_to_srgb(value[2]);
+    let rounded_r = u32::from(linear_to_srgb(value[0])); // widen so the shifts run in 32 bits
+    let rounded_g = u32::from(linear_to_srgb(value[1]));
+    let rounded_b = u32::from(linear_to_srgb(value[2]));
     (rounded_r << 16) + (rounded_g << 8) + rounded_b
 }
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -44,7 +44,8 @@
         return Err(Error::ComponentsOutOfRange);
     }
 
-    let mut factors: Vec<[f32; 3]> = Vec::new();
+    let mut factors: Vec<[f32; 3]> =
+        Vec::with_capacity(components_x as usize * components_y as usize);
 
     for y in 0..components_y {
         for x in 0..components_x {
@@ -56,39 +57,45 @@
     let dc = factors[0];
     let ac = &factors[1..];
 
-    let mut blurhash = String::new();
+    let mut blurhash = String::with_capacity(
+        // 1 byte for size flag
+        1
+        // 1 byte for maximum value
+        + 1
+        // 4 bytes for DC
+        + 4
+        // 2 bytes for each AC
+        + 2 * ac.len(),
+    );
 
     let size_flag = (components_x - 1) + (components_y - 1) * 9;
-    blurhash.push_str(&base83::encode(size_flag, 1));
+    base83::encode_into(size_flag, 1, &mut blurhash);
 
     let maximum_value: f32;
     if !ac.is_empty() {
-        let mut actualmaximum_value = 0.0;
-        for i in 0..components_y * components_x - 1 {
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][0]), actualmaximum_value);
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][1]), actualmaximum_value);
-            actualmaximum_value = f32::max(f32::abs(ac[i as usize][2]), actualmaximum_value);
-        }
+        let actualmaximum_value = ac
+            .iter()
+            .flatten()
+            .map(|x| f32::abs(*x))
+            .reduce(f32::max)
+            .unwrap_or(0.0);
 
         let quantised_maximum_value = f32::max(
             0.,
             f32::min(82., f32::floor(actualmaximum_value * 166. - 0.5)),
         ) as u32;
 
         maximum_value = (quantised_maximum_value + 1) as f32 / 166.;
-        blurhash.push_str(&base83::encode(quantised_maximum_value, 1));
+        base83::encode_into(quantised_maximum_value, 1, &mut blurhash);
     } else {
         maximum_value = 1.;
-        blurhash.push_str(&base83::encode(0, 1));
+        base83::encode_into(0, 1, &mut blurhash);
     }
 
-    blurhash.push_str(&base83::encode(dc::encode(dc), 4));
+    base83::encode_into(dc::encode(dc), 4, &mut blurhash);
 
     for i in 0..components_y * components_x - 1 {
-        blurhash.push_str(&base83::encode(
-            ac::encode(ac[i as usize], maximum_value),
-            2,
-        ));
+        base83::encode_into(ac::encode(ac[i as usize], maximum_value), 2, &mut blurhash);
     }
 
     Ok(blurhash)
@@ -111,10 +118,22 @@
 
     let bytes_per_row = width * 4;
 
+    let pi_cx_over_width = PI * component_x as f32 / width as f32;
+    let pi_cy_over_height = PI * component_y as f32 / height as f32;
+
+    let mut cos_pi_cx_over_width = vec![0.; width as usize];
+    for x in 0..width {
+        cos_pi_cx_over_width[x as usize] = f32::cos(pi_cx_over_width * x as f32);
+    }
+
+    let mut cos_pi_cy_over_height = vec![0.; height as usize];
+    for y in 0..height {
+        cos_pi_cy_over_height[y as usize] = f32::cos(pi_cy_over_height * y as f32);
+    }
+
     for y in 0..height {
         for x in 0..width {
-            let basis = f32::cos(PI * component_x as f32 * x as f32 / width as f32)
-                * f32::cos(PI * component_y as f32 * y as f32 / height as f32);
+            let basis = cos_pi_cx_over_width[x as usize] * cos_pi_cy_over_height[y as usize];
             r += basis * srgb_to_linear(rgb[(4 * x + y * bytes_per_row) as usize]);
             g += basis * srgb_to_linear(rgb[(4 * x + 1 + y * bytes_per_row) as usize]);
             b += basis * srgb_to_linear(rgb[(4 * x + 2 + y * bytes_per_row) as usize]);
@@ -164,17 +183,56 @@
         }
     }
 
-    let bytes_per_row = width * 4;
+    let colors: Vec<_> = colors.chunks(num_x).collect();
+
+    let bytes_per_row = width as usize * 4;
+
+    let pi_over_height = PI / height as f32;
+    let pi_over_width = PI / width as f32;
+
+    // Precompute the cosines
+    let mut cos_i_pi_x_over_width = vec![0.; width as usize * num_x];
+    let mut cos_j_pi_y_over_height = vec![0.; height as usize * num_y];
+
+    for x in 0..width {
+        let pi_x_over_width = x as f32 * pi_over_width;
+        for i in 0..num_x {
+            cos_i_pi_x_over_width[x as usize * num_x + i] = f32::cos(pi_x_over_width * i as f32);
+        }
+    }
 
     for y in 0..height {
-        for x in 0..width {
+        let pi_y_over_height = y as f32 * pi_over_height;
+        for j in 0..num_y {
+            cos_j_pi_y_over_height[y as usize * num_y + j] = f32::cos(j as f32 * pi_y_over_height);
+        }
+    }
+
+    // Hint to the optimizer that the length of the slices is correct
+    assert!(height as usize * num_y == cos_j_pi_y_over_height.len());
+    assert!(width as usize * num_x == cos_i_pi_x_over_width.len());
+
+    for y in 0..height as usize {
+        let pixels = &mut pixels[y * bytes_per_row..][..bytes_per_row];
+
+        // More optimizer hints.
+        assert!(y * num_y + num_y <= cos_j_pi_y_over_height.len());
+
+        for x in 0..width as usize {
             let mut pixel = [0.; 3];
 
-            for j in 0..num_y {
-                for i in 0..num_x {
-                    let basis = f32::cos((PI * x as f32 * i as f32) / width as f32)
-                        * f32::cos((PI * y as f32 * j as f32) / height as f32);
-                    let color = &colors[i + j * num_x];
+            let cos_j_pi_y_over_height = &cos_j_pi_y_over_height[y * num_y..][..num_y];
+            let cos_i_pi_x_over_width = &cos_i_pi_x_over_width[x * num_x..][..num_x];
+
+            assert_eq!(cos_j_pi_y_over_height.len(), colors.len());
+            assert_eq!(cos_j_pi_y_over_height.len(), num_y);
+
+            for (cos_j, colors) in cos_j_pi_y_over_height.iter().zip(colors.iter()) {
+                assert_eq!(cos_i_pi_x_over_width.len(), colors.len());
+                assert_eq!(cos_i_pi_x_over_width.len(), num_x);
+
+                for (cos_i, color) in cos_i_pi_x_over_width.iter().zip(colors.iter()) {
+                    let basis = cos_i * cos_j;
 
                     pixel[0] += color[0] * basis;
                     pixel[1] += color[1] * basis;
@@ -186,11 +244,11 @@
             let int_g = linear_to_srgb(pixel[1]);
             let int_b = linear_to_srgb(pixel[2]);
 
-            let pixels = &mut pixels[((4 * x + y * bytes_per_row) as usize)..][..4];
+            let pixels = &mut pixels[4 * x as usize..][..4];
 
-            pixels[0] = int_r as u8;
-            pixels[1] = int_g as u8;
-            pixels[2] = int_b as u8;
+            pixels[0] = int_r;
+            pixels[1] = int_g;
+            pixels[2] = int_b;
             pixels[3] = 255u8;
         }
     }

diff --git a/src/util.rs b/src/util.rs
@@ -1,12 +1,15 @@
 include!(concat!(env!("OUT_DIR"), "/srgb_lookup.rs"));
 
 /// linear 0.0-1.0 floating point to srgb 0-255 integer conversion.
-pub fn linear_to_srgb(value: f32) -> u32 {
+pub fn linear_to_srgb(value: f32) -> u8 { // float-to-int `as` saturates: result stays in 0..=255
     let v = f32::max(0., f32::min(1., value));
     if v <= 0.003_130_8 {
-        (v * 12.92 * 255. + 0.5).round() as u32
+        (v * 12.92 * 255. + 0.5).round() as u8
     } else {
-        ((1.055 * f32::powf(v, 1. / 2.4) - 0.055) * 255. + 0.5).round() as u32
+        // Algebraically equal to `((1.055 * p - 0.055) * 255. + 0.5).round()`, p = v^(1/2.4), the
+        // form this replaces; `* 255.` is distributed to reduce the number of operations.
+        // NOTE(review): `+ 0.5` and `.round()` both round (v = 0.0 yields 1) -- confirm intended.
+        ((1.055 * 255.) * f32::powf(v, 1. / 2.4) - (0.055 * 255. - 0.5)).round() as u8
     }
 }
 
@@ -15,14 +18,6 @@
     SRGB_LOOKUP[value as usize]
 }
 
-fn sign(n: f32) -> f32 {
-    if n < 0. {
-        -1.
-    } else {
-        1.
-    }
-}
-
 pub fn sign_pow(val: f32, exp: f32) -> f32 {
-    sign(val) * f32::powf(val.abs(), exp)
+    val.abs().powf(exp).copysign(val) // |val|^exp carrying the sign bit of `val`
 }