Skip to content

Commit

Permalink
Clean compression impl (#28)
Browse files Browse the repository at this point in the history
* Remove `as` casting while compressing

* Remove `dict_size`

* Move some floating numbers to constants

* fmt

* Make input buffer part of `CompressContext`

* Remove wc from ctx

* Use slices instead of vecs for compression

* Make `compress_internal` operate on arrays

* Make `dictionary_to_create` use chars instead of slices

* Preallocate more space for output

* Change `enlarge_in` size
  • Loading branch information
adumbidiot authored Oct 20, 2022
1 parent 735e012 commit 50b0435
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 76 deletions.
204 changes: 130 additions & 74 deletions src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,50 @@ use crate::constants::URI_KEY;
use crate::IntoWideIter;
use std::collections::HashMap;
use std::collections::HashSet;
use std::convert::TryInto;

/// The starting size of a codepoint.
///
/// Compression starts with the following codes:
/// 0: u8
/// 1: u16
/// 2: close stream
const START_NUM_BITS: u8 = 2;

/// The stream code for a `u8`.
const U8_CODE: u32 = 0;

/// The stream code for a `u16`.
const U16_CODE: u32 = 1;

/// The number of "base codes",
/// the default codes of all streams.
///
/// These are U8_CODE, U16_CODE, and CLOSE_CODE.
const NUM_BASE_CODES: usize = 3;

#[derive(Debug)]
pub(crate) struct CompressContext<F> {
dictionary: HashMap<Vec<u16>, u32>,
dictionary_to_create: HashSet<Vec<u16>>,
wc: Vec<u16>,
w: Vec<u16>,
enlarge_in: usize,
dict_size: usize,
pub(crate) struct CompressContext<'a, F> {
dictionary: HashMap<&'a [u16], u32>,
dictionary_to_create: HashSet<u16>,

/// The current word, w,
/// in terms of indexes into the input.
w_start_idx: usize,
w_end_idx: usize,

// The counter for increasing the current number of bits in a code.
// The max size of this is 1 << max(num_bits) == 1 + u32::MAX, so we use u64.
enlarge_in: u64,

/// The input buffer.
input: &'a [u16],

/// The output buffer.
output: Vec<u16>,

/// The bit buffer.
bit_buffer: u16,

/// The current number of bits in a code.
///
Expand All @@ -21,12 +56,6 @@ pub(crate) struct CompressContext<F> {
/// 32 < u8::MAX
num_bits: u8,

// result: Vec<u16>,

// Data
output: Vec<u16>,
val: u16,

/// The current bit position.
bit_position: u8,

Expand All @@ -41,30 +70,33 @@ pub(crate) struct CompressContext<F> {
to_char: F,
}

impl<F> CompressContext<F>
impl<'a, F> CompressContext<'a, F>
where
F: Fn(u16) -> u16,
{
/// Make a new [`CompressContext`].
///
/// # Panics
/// Panics if `bits_per_char` exceeds 16.
/// Panics if `bits_per_char` exceeds the number of bits in a u16.
#[inline]
pub fn new(bits_per_char: u8, to_char: F) -> Self {
assert!(bits_per_char <= 16);
pub fn new(input: &'a [u16], bits_per_char: u8, to_char: F) -> Self {
assert!(usize::from(bits_per_char) <= std::mem::size_of::<u16>() * 8);

CompressContext {
dictionary: HashMap::with_capacity(16),
dictionary_to_create: HashSet::with_capacity(16),
wc: Vec::new(),
w: Vec::new(),

w_start_idx: 0,
w_end_idx: 0,

enlarge_in: 2,
dict_size: 3,
num_bits: 2,

// result: Vec::new(),
output: Vec::new(),
val: 0,
input,
output: Vec::with_capacity(input.len() >> 1), // Lowball, assume we can get a 50% reduction in size.

bit_buffer: 0,

num_bits: START_NUM_BITS,

bit_position: 0,
bits_per_char,
Expand All @@ -74,39 +106,48 @@ where

#[inline]
pub fn produce_w(&mut self) {
if self.dictionary_to_create.contains(&self.w) {
let first_w_char = self.w[0];
if first_w_char < 256 {
self.write_bits(self.num_bits, 0);
self.write_bits(8, first_w_char.into());
} else {
self.write_bits(self.num_bits, 1);
self.write_bits(16, first_w_char.into());
let w = &self.input[self.w_start_idx..self.w_end_idx];

match w
.first()
.map(|first_w_char| self.dictionary_to_create.take(first_w_char))
{
Some(Some(first_w_char)) => {
if first_w_char < 256 {
self.write_bits(self.num_bits, U8_CODE);
self.write_bits(8, first_w_char.into());
} else {
self.write_bits(self.num_bits, U16_CODE);
self.write_bits(16, first_w_char.into());
}
self.decrement_enlarge_in();
}
None | Some(None) => {
self.write_bits(self.num_bits, *self.dictionary.get(w).unwrap());
}
self.decrement_enlarge_in();
self.dictionary_to_create.remove(&self.w);
} else {
self.write_bits(self.num_bits, *self.dictionary.get(&self.w).unwrap());
}
self.decrement_enlarge_in();
}

/// Append the bit to the bit buffer.
#[inline]
pub fn write_bit(&mut self, value: u32) {
self.val = (self.val << 1) | (value as u16);
pub fn write_bit(&mut self, bit: bool) {
self.bit_buffer = (self.bit_buffer << 1) | u16::from(bit);
self.bit_position += 1;

if self.bit_position == self.bits_per_char {
self.bit_position = 0;
let char_data = (self.to_char)(self.val);
self.output.push(char_data);
self.val = 0;
let output_char = (self.to_char)(self.bit_buffer);
self.bit_buffer = 0;

self.output.push(output_char);
}
}

#[inline]
pub fn write_bits(&mut self, n: u8, mut value: u32) {
for _ in 0..n {
self.write_bit(value & 1);
self.write_bit(value & 1 == 1);
value >>= 1;
}
}
Expand All @@ -115,40 +156,49 @@ where
pub fn decrement_enlarge_in(&mut self) {
self.enlarge_in -= 1;
if self.enlarge_in == 0 {
self.enlarge_in = 2_usize.pow(self.num_bits.into());
self.enlarge_in = 1 << self.num_bits;
self.num_bits += 1;
}
}

/// Compress a `u16`. This represents a wide char.
#[inline]
pub fn write_u16(&mut self, c: u16) {
let c = vec![c];
if !self.dictionary.contains_key(&c) {
self.dictionary.insert(c.clone(), self.dict_size as u32);
self.dict_size += 1;
self.dictionary_to_create.insert(c.clone());
pub fn write_u16(&mut self, i: usize) {
let c = &self.input[i];
if !self.dictionary.contains_key(std::slice::from_ref(c)) {
self.dictionary.insert(
std::slice::from_ref(c),
(self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
);
self.dictionary_to_create.insert(*c);
}

self.wc = self.w.clone();
self.wc.extend(&c);
if self.dictionary.contains_key(&self.wc) {
self.w = std::mem::take(&mut self.wc);
// wc = w + c.
let wc = &self.input[self.w_start_idx..self.w_end_idx + 1];
if self.dictionary.contains_key(wc) {
// w = wc.
self.w_end_idx += 1;
} else {
self.produce_w();
// Add wc to the dictionary.
self.dictionary
.insert(self.wc.clone(), self.dict_size as u32);
self.dict_size += 1;
self.w = c;
self.dictionary.insert(
wc,
(self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
);

// w = c.
self.w_start_idx = i;
self.w_end_idx = i + 1;
}
}

/// Finish the stream and get the final result.
#[inline]
pub fn finish(mut self) -> Vec<u16> {
let w = &self.input[self.w_start_idx..self.w_end_idx];

// Output the code for w.
if !self.w.is_empty() {
if !w.is_empty() {
self.produce_w();
}

Expand All @@ -158,27 +208,37 @@ where
let str_len = self.output.len();
// Flush the last char
while self.output.len() == str_len {
self.write_bit(0);
self.write_bit(false);
}

self.output
}

/// Perform the compression and return the result.
pub fn compress(mut self) -> Vec<u16> {
for i in 0..self.input.len() {
self.write_u16(i);
}
self.finish()
}
}

/// Compress a string into a [`Vec<u16>`].
///
/// The resulting [`Vec`] may contain invalid UTF16.
#[inline]
pub fn compress(input: impl IntoWideIter) -> Vec<u16> {
compress_internal(input.into_wide_iter(), 16, std::convert::identity)
pub fn compress(data: impl IntoWideIter) -> Vec<u16> {
let data: Vec<u16> = data.into_wide_iter().collect();
compress_internal(&data, 16, std::convert::identity)
}

/// Compress a string as a valid [`String`].
///
/// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid UTF16.
#[inline]
pub fn compress_to_utf16(input: impl IntoWideIter) -> String {
let compressed = compress_internal(input.into_wide_iter(), 15, |n| n + 32);
pub fn compress_to_utf16(data: impl IntoWideIter) -> String {
let data: Vec<u16> = data.into_wide_iter().collect();
let compressed = compress_internal(&data, 15, |n| n + 32);
let mut compressed =
String::from_utf16(&compressed).expect("`compress_to_utf16 output was not valid unicode`");
compressed.push(' ');
Expand All @@ -191,9 +251,8 @@ pub fn compress_to_utf16(input: impl IntoWideIter) -> String {
/// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid unicode.
#[inline]
pub fn compress_to_encoded_uri_component(data: impl IntoWideIter) -> String {
let compressed = compress_internal(data.into_wide_iter(), 6, |n| {
u16::from(URI_KEY[usize::from(n)])
});
let data: Vec<u16> = data.into_wide_iter().collect();
let compressed = compress_internal(&data, 6, |n| u16::from(URI_KEY[usize::from(n)]));

String::from_utf16(&compressed)
.expect("`compress_to_encoded_uri_component` output was not valid unicode`")
Expand All @@ -203,9 +262,8 @@ pub fn compress_to_encoded_uri_component(data: impl IntoWideIter) -> String {
///
/// This function converts the result back into a Rust [`String`] since it is guaranteed to be valid unicode.
pub fn compress_to_base64(data: impl IntoWideIter) -> String {
let mut compressed = compress_internal(data.into_wide_iter(), 6, |n| {
u16::from(BASE64_KEY[usize::from(n)])
});
let data: Vec<u16> = data.into_wide_iter().collect();
let mut compressed = compress_internal(&data, 6, |n| u16::from(BASE64_KEY[usize::from(n)]));

let mod_4 = compressed.len() % 4;

Expand All @@ -231,12 +289,10 @@ pub fn compress_to_uint8_array(data: impl IntoWideIter) -> Vec<u8> {
/// All other compression functions are built on top of this.
/// It generally should not be used directly.
#[inline]
pub fn compress_internal<I, F>(uncompressed: I, bits_per_char: u8, to_char: F) -> Vec<u16>
pub fn compress_internal<F>(data: &[u16], bits_per_char: u8, to_char: F) -> Vec<u16>
where
I: Iterator<Item = u16>,
F: Fn(u16) -> u16,
{
let mut ctx = CompressContext::new(bits_per_char, to_char);
uncompressed.for_each(|c| ctx.write_u16(c));
ctx.finish()
let ctx = CompressContext::new(data, bits_per_char, to_char);
ctx.compress()
}
3 changes: 3 additions & 0 deletions src/decompress.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// TODO: Disable this
#![allow(clippy::cast_possible_truncation)]

use crate::constants::BASE64_KEY;
use crate::constants::CLOSE_CODE;
use crate::constants::URI_KEY;
Expand Down
3 changes: 1 addition & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#![deny(missing_docs)]
#![warn(clippy::cast_lossless)]
#![warn(clippy::cast_possible_wrap)]
// TODO: Enable this
// #![warn(clippy::cast_possible_truncation)]
#![warn(clippy::cast_possible_truncation)]

//! A port of [lz-string](https://github.com/pieroxy/lz-string) to Rust.
//!
Expand Down

0 comments on commit 50b0435

Please sign in to comment.