From a492cabe5fcf6abaad5e45612bc8366cfda42b87 Mon Sep 17 00:00:00 2001 From: austinabell Date: Sun, 22 Mar 2020 09:25:07 -0400 Subject: [PATCH 1/3] Port over and update dependency versions for RLE+ --- Cargo.toml | 1 + Makefile | 1 + utils/rleplus/Cargo.toml | 13 ++ utils/rleplus/src/lib.rs | 277 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 292 insertions(+) create mode 100644 utils/rleplus/Cargo.toml create mode 100644 utils/rleplus/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index f58d5848edf6..a11f37bdfebf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,4 +24,5 @@ members = [ "ipld/hamt", "math/bigint", "tests/serialization_tests", + "utils/rleplus", ] diff --git a/Makefile b/Makefile index 48b02266bf61..7a0d9474a7e9 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ clean: @cargo clean -p ipld_hamt @cargo clean -p ipld_amt @cargo clean -p forest_bigint + @cargo clean -p rleplus @echo "Done cleaning." lint: license clean diff --git a/utils/rleplus/Cargo.toml b/utils/rleplus/Cargo.toml new file mode 100644 index 000000000000..bdfeabacb1c0 --- /dev/null +++ b/utils/rleplus/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "rleplus" +version = "0.1.0" +authors = ["dignifiedquire ", "ChainSafe Systems "] +edition = "2018" + +[dependencies] +bitvec = "0.17.3" +unsigned-varint = "0.3.1" + +[dev-dependencies] +rand_xorshift = "0.1.1" +rand = "0.6.5" diff --git a/utils/rleplus/src/lib.rs b/utils/rleplus/src/lib.rs new file mode 100644 index 000000000000..576c5ef045b4 --- /dev/null +++ b/utils/rleplus/src/lib.rs @@ -0,0 +1,277 @@ +//! # RLE+ Bitset Encoding +//! +//! RLE+ is a lossless compression format based on [RLE](https://en.wikipedia.org/wiki/Run-length_encoding). +//! Its primary goal is to reduce the size in the case of many individual bits, where RLE breaks down quickly, +//! while keeping the same level of compression for large sets of contiguous bits. +//! +//!
In tests it has shown to be more compact than RLE itself, as well as [Concise](https://arxiv.org/pdf/1004.0403.pdf) and [Roaring](https://roaringbitmap.org/). +//! +//! ## Format +//! +//! The format consists of a header, followed by a series of blocks, of which there are three different types. +//! +//! The format can be expressed as the following [BNF](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) grammar. +//! +//! <encoding> ::=
<header> <blocks> +//!
<header> ::= <bit> +//! <block> ::= <block_single> | <block_short> | <block_long> +//! <block_single> ::= "1" +//! <block_short> ::= "01" <bit> <bit> <bit> <bit> +//! <block_long> ::= "00" <unsigned_varint> +//! <bit> ::= "0" | "1" +//! +//! An `<unsigned_varint>` is defined as specified [here](https://github.com/multiformats/unsigned-varint). +//! +//! ### Header +//! +//! The header indicates the very first bit of the bit vector to encode. This means the first bit is always +//! the same for the encoded and non encoded form. +//! +//! ### Blocks +//! +//! The blocks represent how many bits, of the current bit type there are. As `0` and `1` alternate in a bit vector +//! the initial bit, which is stored in the header, is enough to determine if a length is currently referencing +//! a set of `0`s, or `1`s. +//! +//! #### Block Single +//! +//! If the running length of the current bit is only `1`, it is encoded as a single set bit. +//! +//! #### Block Short +//! +//! If the running length is less than `16`, it can be encoded into up to four bits, which a short block +//! represents. The length is encoded into 4 bits, and prefixed with `01`, to indicate a short block. +//! +//! #### Block Long +//! +//! If the running length is `16` or larger, it is encoded into a varint, and then prefixed with `00` to indicate +//! a long block. +//! +//! +//! > **Note:** The encoding is unique, so no matter which algorithm for encoding is used, it should produce +//! > the same encoding, given the same input. +//! + +use bitvec::prelude::{BitVec, Lsb0}; + +/// Encode the given bitset into their RLE+ encoded representation.
+pub fn encode(raw: &BitVec) -> BitVec { + let mut encoding = BitVec::new(); + + if raw.is_empty() { + return encoding; + } + + // Header + // encode the very first bit (the first block contains this, then alternating) + encoding.push(*raw.get(0).unwrap()); + + // the running length + let mut count = 1; + + // the current bit type + let mut current = raw.get(0); + + let last = raw.len(); + + for i in 1..=raw.len() { + if raw.get(i) != current || i == last { + if i == last && raw.get(i) == current { + count += 1; + } + + if count == 1 { + // Block Single + encoding.push(true); + } else if count < 16 { + // Block Short + // 4 bits + let s_vec: BitVec = BitVec::from(&[count as u8][..]); + + // prefix: 01 + encoding.push(false); + encoding.push(true); + encoding.extend(s_vec.into_iter().take(4)); + count = 1; + } else { + // Block Long + let mut v = [0u8; 10]; + let s = unsigned_varint::encode::u64(count, &mut v); + let s_vec: BitVec = BitVec::from(s); + + // prefix: 00 + encoding.push(false); + encoding.push(false); + + encoding.extend(s_vec.into_iter()); + count = 1; + } + current = raw.get(i); + } else { + count += 1; + } + } + + encoding +} + +/// Decode an RLE+ encoded bitset into its original form. 
+pub fn decode(enc: &BitVec) -> BitVec { + let mut decoded = BitVec::new(); + + if enc.is_empty() { + return decoded; + } + + // Header + // read the inital bit + let mut cur = *enc.get(0).unwrap(); + + // pointer into the encoded bitvec + let mut i = 1; + + let len = enc.len(); + + while i < len { + // read the next prefix + match enc.get(i).unwrap() { + false => { + // multiple bits + match enc.get(i + 1) { + Some(false) => { + // Block Long + // prefix: 00 + + let buf = enc + .iter() + .skip(i + 2) + .take(10 * 8) + .cloned() + .collect::>(); + let buf_ref: &[u8] = buf.as_ref(); + let (len, rest) = unsigned_varint::decode::u64(buf_ref).unwrap(); + + // insert this many bits + decoded.extend((0..len).map(|_| cur)); + + // prefix + i += 2; + // this is how much space the varint took in bits + i += (buf_ref.len() * 8) - (rest.len() * 8); + } + Some(true) => { + // Block Short + // prefix: 01 + let buf = enc + .iter() + .skip(i + 2) + .take(4) + .cloned() + .collect::>(); + let res: Vec = buf.into(); + assert_eq!(res.len(), 1); + let len = res[0] as usize; + + // prefix + i += 2; + // length of the encoded number + i += 4; + + decoded.extend((0..len).map(|_| cur)); + } + None => { + panic!("premature end"); + } + } + } + true => { + // Block Signle + decoded.push(cur); + i += 1; + } + } + + // swith the cur value + cur = !cur; + } + + decoded +} + +#[cfg(test)] +mod tests { + use super::*; + + use bitvec::*; + use rand::{Rng, RngCore, SeedableRng}; + use rand_xorshift::XorShiftRng; + + #[test] + fn test_rle_plus_basics() { + let cases: Vec<(BitVec, BitVec)> = vec![ + ( + bitvec![Lsb0, u8; 0; 8], + bitvec![Lsb0, u8; + 0, // starts with 0 + 0, 1, // fits into 4 bits + 0, 0, 0, 1, // 8 + ], + ), + ( + bitvec![Lsb0, u8; 0, 0, 0, 0, 1, 0, 0, 0], + bitvec![Lsb0, u8; + 0, // starts with 0 + 0, 1, // fits into 4 bits + 0, 0, 1, 0, // 4 - 0 + 1, // 1 - 1 + 0, 1, // fits into 4 bits + 1, 1, 0, 0 // 3 - 0 + ], + ), + ]; + + for (i, case) in cases.into_iter().enumerate() { + 
assert_eq!(encode(&case.0), case.1, "case: {}", i); + } + } + + #[test] + #[ignore] + fn test_rle_plus_roundtrip_small() { + let mut rng = XorShiftRng::from_seed([1u8; 16]); + + for _i in 0..10000 { + let len: usize = rng.gen_range(0, 1000); + + let mut src = vec![0u8; len]; + rng.fill_bytes(&mut src); + + let original: BitVec = src.into(); + + let encoded = encode(&original); + let decoded = decode(&encoded); + + assert_eq!(original, decoded); + } + } + + #[test] + #[ignore] + fn test_rle_plus_roundtrip_large() { + let mut rng = XorShiftRng::from_seed([2u8; 16]); + + for _i in 0..100 { + let len: usize = rng.gen_range(0, 100000); + + let mut src = vec![0u8; len]; + rng.fill_bytes(&mut src); + + let original: BitVec = src.into(); + + let encoded = encode(&original); + let decoded = decode(&encoded); + + assert_eq!(original, decoded); + } + } +} From 135e15cd964fc2b7f1202dcd477706ab29a4d047 Mon Sep 17 00:00:00 2001 From: austinabell Date: Sun, 22 Mar 2020 19:29:28 -0400 Subject: [PATCH 2/3] Update RLE+ encoding to include version and serialization tests --- utils/rleplus/Cargo.toml | 4 ++ utils/rleplus/src/bitvec_serde.rs | 70 +++++++++++++++++++++++++++++++ utils/rleplus/src/lib.rs | 54 +++++++++++++++++------- 3 files changed, 113 insertions(+), 15 deletions(-) create mode 100644 utils/rleplus/src/bitvec_serde.rs diff --git a/utils/rleplus/Cargo.toml b/utils/rleplus/Cargo.toml index bdfeabacb1c0..1c92f9bb4953 100644 --- a/utils/rleplus/Cargo.toml +++ b/utils/rleplus/Cargo.toml @@ -7,7 +7,11 @@ edition = "2018" [dependencies] bitvec = "0.17.3" unsigned-varint = "0.3.1" +serde = { version = "1.0", features = ["derive"] } +serde_bytes = "0.11.3" [dev-dependencies] rand_xorshift = "0.1.1" rand = "0.6.5" +encoding = { package = "forest_encoding", path = "../../encoding/" } +hex = "0.4.0" diff --git a/utils/rleplus/src/bitvec_serde.rs b/utils/rleplus/src/bitvec_serde.rs new file mode 100644 index 000000000000..8f6919d4752c --- /dev/null +++ 
b/utils/rleplus/src/bitvec_serde.rs @@ -0,0 +1,70 @@ +use super::{decode, encode}; +use bitvec::prelude::{BitVec, Lsb0}; +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; + +/// Remote derive type for big int /// Wrapper for serializing big ints to match filecoin spec. Serializes as bytes. +pub struct BitVecSer<'a>(pub &'a BitVec); + +/// Wrapper for deserializing as BigInt from bytes. +pub struct BitVecDe(pub BitVec); + +impl Serialize for BitVecSer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + // This serialize will encode into rle+ before serializing + serde_bytes::serialize(encode(self.0).as_slice(), serializer) + } +} + +impl<'de> Deserialize<'de> for BitVecDe { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + // Deserialize will decode using rle+ decompression + let bz: Vec = serde_bytes::deserialize(deserializer)?; + let compressed = BitVec::from_vec(bz); + Ok(BitVecDe(decode(&compressed).map_err(de::Error::custom)?)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bitvec::bitvec; + use encoding::{from_slice, to_vec}; + + #[test] + fn serialize_node_symmetric() { + let bit_vec = bitvec![Lsb0, u8; 0, 1, 0, 1, 1, 1, 1, 1, 1]; + let cbor_bz = to_vec(&BitVecSer(&bit_vec)).unwrap(); + let BitVecDe(deserialized) = from_slice::(&cbor_bz).unwrap(); + assert_eq!(deserialized.count_ones(), 7); + assert_eq!(deserialized.as_slice(), bit_vec.as_slice()); + } + + #[test] + // ported test from specs-actors `bitfield_test.go` with added vector + fn bit_vec_unset_vector() { + let mut bv: BitVec = BitVec::with_capacity(5); + bv.resize(6, false); + bv.set(1, true); + bv.set(2, true); + bv.set(3, true); + bv.set(4, true); + bv.set(5, true); + + bv.set(3, false); + assert_ne!(bv.get(3), Some(&true)); + assert_eq!(bv.count_ones(), 4); + + // Test cbor marshal and unmarshal + let cbor_bz = to_vec(&BitVecSer(&bv)).unwrap(); + assert_eq!(&cbor_bz, &[0x43, 0xa8, 0x54, 0x0]); + let 
BitVecDe(deserialized) = from_slice::(&cbor_bz).unwrap(); + + assert_eq!(deserialized.as_slice(), bv.as_slice()); + } +} diff --git a/utils/rleplus/src/lib.rs b/utils/rleplus/src/lib.rs index 576c5ef045b4..4a564d17c3d2 100644 --- a/utils/rleplus/src/lib.rs +++ b/utils/rleplus/src/lib.rs @@ -12,13 +12,17 @@ //! //! The format can be expressed as the following [BNF](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) grammar. //! -//! <encoding> ::=
<header> <blocks> -//!
<header> ::= <bit> -//! <block> ::= <block_single> | <block_short> | <block_long> -//! <block_single> ::= "1" -//! <block_short> ::= "01" <bit> <bit> <bit> <bit> -//! <block_long> ::= "00" <unsigned_varint> -//! <bit> ::= "0" | "1" +//! ```compile_fail +//! <encoding> ::=
<header> <blocks> +//!
<header> ::= <version> <bit> +//! <version> ::= "00" +//! <blocks> ::= <block> <blocks> | "" +//! <block> ::= <block_single> | <block_short> | <block_long> +//! <block_single> ::= "1" +//! <block_short> ::= "01" <bit> <bit> <bit> <bit> +//! <block_long> ::= "00" <unsigned_varint> +//! <bit> ::= "0" | "1" +//! ``` //! //! An `<unsigned_varint>` is defined as specified [here](https://github.com/multiformats/unsigned-varint). //! @@ -52,7 +56,11 @@ //! > the same encoding, given the same input. //! +mod bitvec_serde; + +pub use bitvec; use bitvec::prelude::{BitVec, Lsb0}; +pub use bitvec_serde::*; /// Encode the given bitset into their RLE+ encoded representation. pub fn encode(raw: &BitVec) -> BitVec { @@ -63,6 +71,7 @@ pub fn encode(raw: &BitVec) -> BitVec { } // Header + // encode the very first bit (the first block contains this, then alternating) encoding.push(*raw.get(0).unwrap()); @@ -112,23 +121,36 @@ pub fn encode(raw: &BitVec) -> BitVec { } } + // encode version "00" + encoding.insert(0, false); + encoding.insert(0, false); + encoding } /// Decode an RLE+ encoded bitset into its original form. -pub fn decode(enc: &BitVec) -> BitVec { +pub fn decode(enc: &BitVec) -> Result, &'static str> { let mut decoded = BitVec::new(); if enc.is_empty() { - return decoded; + return Ok(decoded); } // Header + if enc.len() < 3 { + return Err("Failed to decode, bytes must be at least 3 bits long"); + } + + // read version (expects "00") + if *enc.get(0).unwrap() || *enc.get(1).unwrap() { + return Err("Invalid version, expected '00'"); + } + // read the inital bit let mut cur = *enc.get(0).unwrap(); // pointer into the encoded bitvec - let mut i = 1; + let mut i = 3; let len = enc.len(); @@ -146,7 +168,7 @@ pub fn decode(enc: &BitVec) -> BitVec { .iter() .skip(i + 2) .take(10 * 8) - .cloned() + .copied() .collect::>(); let buf_ref: &[u8] = buf.as_ref(); let (len, rest) = unsigned_varint::decode::u64(buf_ref).unwrap(); @@ -166,7 +188,7 @@ pub fn decode(enc: &BitVec) -> BitVec { .iter() .skip(i + 2) .take(4) - .cloned() + .copied() .collect::>(); let res: Vec = buf.into(); assert_eq!(res.len(), 1); @@ -195,7 +217,7 @@ pub fn decode(enc: &BitVec) -> BitVec { cur = !cur; } - decoded +
Ok(decoded) } #[cfg(test)] @@ -212,6 +234,7 @@ mod tests { ( bitvec![Lsb0, u8; 0; 8], bitvec![Lsb0, u8; + 0, 0, // version 0, // starts with 0 0, 1, // fits into 4 bits 0, 0, 0, 1, // 8 @@ -220,6 +243,7 @@ mod tests { ( bitvec![Lsb0, u8; 0, 0, 0, 0, 1, 0, 0, 0], bitvec![Lsb0, u8; + 0, 0, // version 0, // starts with 0 0, 1, // fits into 4 bits 0, 0, 1, 0, // 4 - 0 @@ -249,7 +273,7 @@ mod tests { let original: BitVec = src.into(); let encoded = encode(&original); - let decoded = decode(&encoded); + let decoded = decode(&encoded).unwrap(); assert_eq!(original, decoded); } @@ -269,7 +293,7 @@ mod tests { let original: BitVec = src.into(); let encoded = encode(&original); - let decoded = decode(&encoded); + let decoded = decode(&encoded).unwrap(); assert_eq!(original, decoded); } From 00d19da7c678b800177271e4b185b2aa46b9ec91 Mon Sep 17 00:00:00 2001 From: austinabell Date: Sun, 22 Mar 2020 19:38:03 -0400 Subject: [PATCH 3/3] Lint and improve error handling --- utils/rleplus/src/bitvec_serde.rs | 7 +++++-- utils/rleplus/src/lib.rs | 23 +++++++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/utils/rleplus/src/bitvec_serde.rs b/utils/rleplus/src/bitvec_serde.rs index 8f6919d4752c..12ed37728c62 100644 --- a/utils/rleplus/src/bitvec_serde.rs +++ b/utils/rleplus/src/bitvec_serde.rs @@ -1,11 +1,14 @@ +// Copyright 2020 ChainSafe Systems +// SPDX-License-Identifier: Apache-2.0, MIT + use super::{decode, encode}; use bitvec::prelude::{BitVec, Lsb0}; use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; -/// Remote derive type for big int /// Wrapper for serializing big ints to match filecoin spec. Serializes as bytes. +/// Wrapper for serializing bit vector with RLE+ encoding pub struct BitVecSer<'a>(pub &'a BitVec); -/// Wrapper for deserializing as BigInt from bytes. +/// Wrapper for deserializing bit vector with RLE+ decoding from bytes. 
pub struct BitVecDe(pub BitVec); impl Serialize for BitVecSer<'_> { diff --git a/utils/rleplus/src/lib.rs b/utils/rleplus/src/lib.rs index 4a564d17c3d2..6d1efd2c953d 100644 --- a/utils/rleplus/src/lib.rs +++ b/utils/rleplus/src/lib.rs @@ -1,3 +1,6 @@ +// Copyright 2020 ChainSafe Systems +// SPDX-License-Identifier: Apache-2.0, MIT + //! # RLE+ Bitset Encoding //! //! RLE+ is a lossless compression format based on [RLE](https://en.wikipedia.org/wiki/Run-length_encoding). @@ -71,6 +74,9 @@ pub fn encode(raw: &BitVec) -> BitVec { } // Header + // encode version "00" and push to start of encoding + encoding.insert(0, false); + encoding.insert(0, false); // encode the very first bit (the first block contains this, then alternating) encoding.push(*raw.get(0).unwrap()); @@ -121,10 +127,6 @@ pub fn encode(raw: &BitVec) -> BitVec { } } - // encode version "00" - encoding.insert(0, false); - encoding.insert(0, false); - encoding } @@ -147,7 +149,7 @@ pub fn decode(enc: &BitVec) -> Result, &'static str> } // read the inital bit - let mut cur = *enc.get(0).unwrap(); + let mut cur = *enc.get(2).unwrap(); // pointer into the encoded bitvec let mut i = 3; @@ -171,7 +173,8 @@ pub fn decode(enc: &BitVec) -> Result, &'static str> .copied() .collect::>(); let buf_ref: &[u8] = buf.as_ref(); - let (len, rest) = unsigned_varint::decode::u64(buf_ref).unwrap(); + let (len, rest) = unsigned_varint::decode::u64(buf_ref) + .map_err(|_| "Failed to decode uvarint")?; // insert this many bits decoded.extend((0..len).map(|_| cur)); @@ -191,7 +194,11 @@ pub fn decode(enc: &BitVec) -> Result, &'static str> .copied() .collect::>(); let res: Vec = buf.into(); - assert_eq!(res.len(), 1); + + if res.len() != 1 { + return Err("Invalid short block encoding"); + } + let len = res[0] as usize; // prefix @@ -202,7 +209,7 @@ pub fn decode(enc: &BitVec) -> Result, &'static str> decoded.extend((0..len).map(|_| cur)); } None => { - panic!("premature end"); + return Err("premature end to bits"); } } }