Clean compression impl (#28)

* Remove `as` casting while compressing * Remove `dict_size` * Move some floating numbers to constants * fmt * Make input buffer part of `CompressContext` * Remove wc from ctx * Use slices instead of vecs for compression * Make `compress_internal` operate on arrays * Make `dictionary_to_create` use chars instead of slices * Preallocate more space for output * Change `enlarge_in` size
adumbidiot · Oct 20, 2022 · 50b0435 · 50b0435
1 parent 735e012
commit 50b0435
Show file tree

Hide file tree

Showing 3 changed files with 134 additions and 76 deletions.
diff --git a/src/compress.rs b/src/compress.rs
@@ -4,15 +4,50 @@ use crate::constants::URI_KEY;
 use crate::IntoWideIter;
 use std::collections::HashMap;
 use std::collections::HashSet;
+use std::convert::TryInto;
+
+/// The starting size of a codepoint.
+///
+/// Compression starts with the following codes:
+/// 0: u8
+/// 1: u16
+/// 2: close stream
+const START_NUM_BITS: u8 = 2;
+
+/// The stream code for a `u8`.
+const U8_CODE: u32 = 0;
+
+/// The stream code for a `u16`.
+const U16_CODE: u32 = 1;
+
+/// The number of "base codes",
+/// the default codes of all streams.
+///
+/// These are U8_CODE, U16_CODE, and CLOSE_CODE.
+const NUM_BASE_CODES: usize = 3;
 
 #[derive(Debug)]
-pub(crate) struct CompressContext<F> {
-    dictionary: HashMap<Vec<u16>, u32>,
-    dictionary_to_create: HashSet<Vec<u16>>,
-    wc: Vec<u16>,
-    w: Vec<u16>,
-    enlarge_in: usize,
-    dict_size: usize,
+pub(crate) struct CompressContext<'a, F> {
+    dictionary: HashMap<&'a [u16], u32>,
+    dictionary_to_create: HashSet<u16>,
+
+    /// The current word, w,
+    /// in terms of indexes into the input.
+    w_start_idx: usize,
+    w_end_idx: usize,
+
+    // The counter for increasing the current number of bits in a code.
+    // The max size of this is 1 << max(num_bits) == 1 + u32::MAX, so we use u64.
+    enlarge_in: u64,
+
+    /// The input buffer.
+    input: &'a [u16],
+
+    /// The output buffer.
+    output: Vec<u16>,
+
+    /// The bit buffer.
+    bit_buffer: u16,
 
     /// The current number of bits in a code.
     ///
@@ -21,12 +56,6 @@ pub(crate) struct CompressContext<F> {
     /// 32 < u8::MAX
     num_bits: u8,
 
-    // result: Vec<u16>,
-
-    // Data
-    output: Vec<u16>,
-    val: u16,
-
     /// The current bit position.
     bit_position: u8,
 
@@ -41,30 +70,33 @@ pub(crate) struct CompressContext<F> {
     to_char: F,
 }
 
-impl<F> CompressContext<F>
+impl<'a, F> CompressContext<'a, F>
 where
     F: Fn(u16) -> u16,
 {
     /// Make a new [`CompressContext`].
     ///
     /// # Panics
-    /// Panics if `bits_per_char` exceeds 16.
+    /// Panics if `bits_per_char` exceeds the number of bits in a u16.
     #[inline]
-    pub fn new(bits_per_char: u8, to_char: F) -> Self {
-        assert!(bits_per_char <= 16);
+    pub fn new(input: &'a [u16], bits_per_char: u8, to_char: F) -> Self {
+        assert!(usize::from(bits_per_char) <= std::mem::size_of::<u16>() * 8);
 
         CompressContext {
             dictionary: HashMap::with_capacity(16),
             dictionary_to_create: HashSet::with_capacity(16),
-            wc: Vec::new(),
-            w: Vec::new(),
+
+            w_start_idx: 0,
+            w_end_idx: 0,
+
             enlarge_in: 2,
-            dict_size: 3,
-            num_bits: 2,
 
-            // result: Vec::new(),
-            output: Vec::new(),
-            val: 0,
+            input,
+            output: Vec::with_capacity(input.len() >> 1), // Lowball, assume we can get a 50% reduction in size.
+
+            bit_buffer: 0,
+
+            num_bits: START_NUM_BITS,
 
             bit_position: 0,
             bits_per_char,
@@ -74,39 +106,48 @@ where
 
     #[inline]
     pub fn produce_w(&mut self) {
-        if self.dictionary_to_create.contains(&self.w) {
-            let first_w_char = self.w[0];
-            if first_w_char < 256 {
-                self.write_bits(self.num_bits, 0);
-                self.write_bits(8, first_w_char.into());
-            } else {
-                self.write_bits(self.num_bits, 1);
-                self.write_bits(16, first_w_char.into());
+        let w = &self.input[self.w_start_idx..self.w_end_idx];
+
+        match w
+            .first()
+            .map(|first_w_char| self.dictionary_to_create.take(first_w_char))
+        {
+            Some(Some(first_w_char)) => {
+                if first_w_char < 256 {
+                    self.write_bits(self.num_bits, U8_CODE);
+                    self.write_bits(8, first_w_char.into());
+                } else {
+                    self.write_bits(self.num_bits, U16_CODE);
+                    self.write_bits(16, first_w_char.into());
+                }
+                self.decrement_enlarge_in();
+            }
+            None | Some(None) => {
+                self.write_bits(self.num_bits, *self.dictionary.get(w).unwrap());
             }
-            self.decrement_enlarge_in();
-            self.dictionary_to_create.remove(&self.w);
-        } else {
-            self.write_bits(self.num_bits, *self.dictionary.get(&self.w).unwrap());
         }
         self.decrement_enlarge_in();
     }
 
+    /// Append the bit to the bit buffer.
     #[inline]
-    pub fn write_bit(&mut self, value: u32) {
-        self.val = (self.val << 1) | (value as u16);
+    pub fn write_bit(&mut self, bit: bool) {
+        self.bit_buffer = (self.bit_buffer << 1) | u16::from(bit);
         self.bit_position += 1;
+
         if self.bit_position == self.bits_per_char {
             self.bit_position = 0;
-            let char_data = (self.to_char)(self.val);
-            self.output.push(char_data);
-            self.val = 0;
+            let output_char = (self.to_char)(self.bit_buffer);
+            self.bit_buffer = 0;
+
+            self.output.push(output_char);
         }
     }
 
     #[inline]
     pub fn write_bits(&mut self, n: u8, mut value: u32) {
         for _ in 0..n {
-            self.write_bit(value & 1);
+            self.write_bit(value & 1 == 1);
             value >>= 1;
         }
     }
@@ -115,40 +156,49 @@ where
     pub fn decrement_enlarge_in(&mut self) {
         self.enlarge_in -= 1;
         if self.enlarge_in == 0 {
-            self.enlarge_in = 2_usize.pow(self.num_bits.into());
+            self.enlarge_in = 1 << self.num_bits;
             self.num_bits += 1;
         }
     }
 
     /// Compress a `u16`. This represents a wide char.
     #[inline]
-    pub fn write_u16(&mut self, c: u16) {
-        let c = vec![c];
-        if !self.dictionary.contains_key(&c) {
-            self.dictionary.insert(c.clone(), self.dict_size as u32);
-            self.dict_size += 1;
-            self.dictionary_to_create.insert(c.clone());
+    pub fn write_u16(&mut self, i: usize) {
+        let c = &self.input[i];
+        if !self.dictionary.contains_key(std::slice::from_ref(c)) {
+            self.dictionary.insert(
+                std::slice::from_ref(c),
+                (self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
+            );
+            self.dictionary_to_create.insert(*c);
         }
 
-        self.wc = self.w.clone();
-        self.wc.extend(&c);
-        if self.dictionary.contains_key(&self.wc) {
-            self.w = std::mem::take(&mut self.wc);
+        // wc = w + c.
+        let wc = &self.input[self.w_start_idx..self.w_end_idx + 1];
+        if self.dictionary.contains_key(wc) {
+            // w = wc.
+            self.w_end_idx += 1;
         } else {
             self.produce_w();
             // Add wc to the dictionary.
-            self.dictionary
-                .insert(self.wc.clone(), self.dict_size as u32);
-            self.dict_size += 1;
-            self.w = c;
+            self.dictionary.insert(
+                wc,
+                (self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
+            );
+
+            // w = c.
+            self.w_start_idx = i;
+            self.w_end_idx = i + 1;
         }
     }
 
     /// Finish the stream and get the final result.
     #[inline]
     pub fn finish(mut self) -> Vec<u16> {
+        let w = &self.input[self.w_start_idx..self.w_end_idx];
+
         // Output the code for w.
-        if !self.w.is_empty() {
+        if !w.is_empty() {
             self.produce_w();
         }
 
@@ -158,27 +208,37 @@ where
         let str_len = self.output.len();
         // Flush the last char
         while self.output.len() == str_len {
-            self.write_bit(0);
+            self.write_bit(false);
         }
 
         self.output
     }
+
+    /// Perform the compression and return the result.
+    pub fn compress(mut self) -> Vec<u16> {
+        for i in 0..self.input.len() {
+            self.write_u16(i);
+        }
+        self.finish()
+    }
 }
 
 /// Compress a string into a [`Vec<u16>`].
 ///
 /// The resulting [`Vec`] may contain invalid UTF16.
 #[inline]
-pub fn compress(input: impl IntoWideIter) -> Vec<u16> {
-    compress_internal(input.into_wide_iter(), 16, std::convert::identity)
+pub fn compress(data: impl IntoWideIter) -> Vec<u16> {
+    let data: Vec<u16> = data.into_wide_iter().collect();
+    compress_internal(&data, 16, std::convert::identity)
 }
 
 /// Compress a string as a valid [`String`].
 ///
 /// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid UTF16.
 #[inline]
-pub fn compress_to_utf16(input: impl IntoWideIter) -> String {
-    let compressed = compress_internal(input.into_wide_iter(), 15, |n| n + 32);
+pub fn compress_to_utf16(data: impl IntoWideIter) -> String {
+    let data: Vec<u16> = data.into_wide_iter().collect();
+    let compressed = compress_internal(&data, 15, |n| n + 32);
     let mut compressed =
         String::from_utf16(&compressed).expect("`compress_to_utf16 output was not valid unicode`");
     compressed.push(' ');
@@ -191,9 +251,8 @@ pub fn compress_to_utf16(input: impl IntoWideIter) -> String {
 /// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid unicode.
 #[inline]
 pub fn compress_to_encoded_uri_component(data: impl IntoWideIter) -> String {
-    let compressed = compress_internal(data.into_wide_iter(), 6, |n| {
-        u16::from(URI_KEY[usize::from(n)])
-    });
+    let data: Vec<u16> = data.into_wide_iter().collect();
+    let compressed = compress_internal(&data, 6, |n| u16::from(URI_KEY[usize::from(n)]));
 
     String::from_utf16(&compressed)
         .expect("`compress_to_encoded_uri_component` output was not valid unicode`")
@@ -203,9 +262,8 @@ pub fn compress_to_encoded_uri_component(data: impl IntoWideIter) -> String {
 ///
 /// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid unicode.
 pub fn compress_to_base64(data: impl IntoWideIter) -> String {
-    let mut compressed = compress_internal(data.into_wide_iter(), 6, |n| {
-        u16::from(BASE64_KEY[usize::from(n)])
-    });
+    let data: Vec<u16> = data.into_wide_iter().collect();
+    let mut compressed = compress_internal(&data, 6, |n| u16::from(BASE64_KEY[usize::from(n)]));
 
     let mod_4 = compressed.len() % 4;
 
@@ -231,12 +289,10 @@ pub fn compress_to_uint8_array(data: impl IntoWideIter) -> Vec<u8> {
 /// All other compression functions are built on top of this.
 /// It generally should not be used directly.
 #[inline]
-pub fn compress_internal<I, F>(uncompressed: I, bits_per_char: u8, to_char: F) -> Vec<u16>
+pub fn compress_internal<F>(data: &[u16], bits_per_char: u8, to_char: F) -> Vec<u16>
 where
-    I: Iterator<Item = u16>,
     F: Fn(u16) -> u16,
 {
-    let mut ctx = CompressContext::new(bits_per_char, to_char);
-    uncompressed.for_each(|c| ctx.write_u16(c));
-    ctx.finish()
+    let ctx = CompressContext::new(data, bits_per_char, to_char);
+    ctx.compress()
 }
diff --git a/src/decompress.rs b/src/decompress.rs
@@ -1,3 +1,6 @@
+// TODO: Disable this
+#![allow(clippy::cast_possible_truncation)]
+
 use crate::constants::BASE64_KEY;
 use crate::constants::CLOSE_CODE;
 use crate::constants::URI_KEY;

diff --git a/src/lib.rs b/src/lib.rs
@@ -2,8 +2,7 @@
 #![deny(missing_docs)]
 #![warn(clippy::cast_lossless)]
 #![warn(clippy::cast_possible_wrap)]
-// TODO: Enable this
-// #![warn(clippy::cast_possible_truncation)]
+#![warn(clippy::cast_possible_truncation)]
 
 //! A port of [lz-string](https://github.com/pieroxy/lz-string) to Rust.
 //!