From e1eb46d1d4a54cd117d23e818fd488cc914c0f99 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sat, 29 Jun 2024 16:32:35 +0200 Subject: [PATCH] refactor(rust): remove seqmacro and u8,u16 bitpack After looking for a long time at how to SIMD this bitpacking, I am pausing for a bit to work on something else. There are some code deletions, namely removing the unused u8 and u16 implementations of bit(un)packing. This also removes the `seq_macro` crate. Maybe, I will look at this again in the future. --- Cargo.lock | 7 -- crates/polars-parquet/Cargo.toml | 1 - .../src/parquet/encoding/bitpacked/mod.rs | 84 ++++++++++++------- .../src/parquet/encoding/bitpacked/pack.rs | 52 +----------- .../src/parquet/encoding/bitpacked/unpack.rs | 42 +++++----- 5 files changed, 77 insertions(+), 109 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66aad84a20d6..2cc791536a93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3282,7 +3282,6 @@ dependencies = [ "polars-error", "polars-utils", "rand", - "seq-macro", "serde", "simdutf8", "snap", @@ -4266,12 +4265,6 @@ dependencies = [ "serde", ] -[[package]] -name = "seq-macro" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" - [[package]] name = "serde" version = "1.0.203" diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index b30d86df794b..cdf727481976 100644 --- a/crates/polars-parquet/Cargo.toml +++ b/crates/polars-parquet/Cargo.toml @@ -26,7 +26,6 @@ polars-utils = { workspace = true } simdutf8 = { workspace = true } parquet-format-safe = "0.2" -seq-macro = { version = "0.3", default-features = false } streaming-decompression = "0.1" async-stream = { version = "0.3.3", optional = true } diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs index 4d80a43ed879..ef6c5313ba26 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs @@ -1,3 +1,57 @@ +macro_rules! seq_macro { + ($i:ident in 1..31 $block:block) => { + seq_macro!($i in [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + ] $block) + }; + ($i:ident in 0..32 $block:block) => { + seq_macro!($i in [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ] $block) + }; + ($i:ident in 0..=32 $block:block) => { + seq_macro!($i in [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, + ] $block) + }; + ($i:ident in 1..63 $block:block) => { + seq_macro!($i in [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + ] $block) + }; + ($i:ident in 0..64 $block:block) => { + seq_macro!($i in [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + ] $block) + }; + ($i:ident in 0..=64 $block:block) => { + seq_macro!($i in [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, + ] $block) + }; + ($i:ident in [$($value:literal),+ $(,)?] $block:block) => { + $({ + #[allow(non_upper_case_globals)] + const $i: usize = $value; + { $block } + })+ + }; +} + mod decode; mod encode; mod pack; @@ -105,36 +159,6 @@ pub trait Unpackable: Copy + Sized + Default { fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]); } -impl Unpackable for u8 { - type Packed = [u8; 8]; - type Unpacked = [u8; 8]; - - #[inline] - fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { - unpack::unpack8(packed, unpacked, num_bits) - } - - #[inline] - fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { - pack::pack8(packed, unpacked, num_bits) - } -} - -impl Unpackable for u16 { - type Packed = [u8; 16 * 2]; - type Unpacked = [u16; 16]; - - #[inline] - fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { - unpack::unpack16(packed, unpacked, num_bits) - } - - #[inline] - fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { - pack::pack16(packed, unpacked, num_bits) - } -} - impl Unpackable for u32 { type Packed = [u8; 32 * 4]; type Unpacked = [u32; 32]; diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs index bb8263ac4b23..c318f42649d3 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs @@ -23,7 +23,7 @@ macro_rules! pack_impl { // Using microbenchmark (79d1fff), unrolling this loop is over 10x // faster than not (>20x faster than old algorithm) - seq_macro::seq!(i in 1..$bits_minus_one { + seq_macro!(i in 1..$bits_minus_one { let bits_filled: usize = i * NUM_BITS; let inner_cursor: usize = bits_filled % $bits; let remaining: usize = $bits - inner_cursor; @@ -69,7 +69,7 @@ macro_rules! pack { /// Pack unpacked `input` into `output` with a bit width of `num_bits` pub fn $name(input: &[$t; $bits], output: &mut [u8], num_bits: usize) { // This will get optimised into a jump table - seq_macro::seq!(i in 0..=$bits { + seq_macro!(i in 0..=$bits { if i == num_bits { unsafe { return $name::pack::(input, output); @@ -81,8 +81,6 @@ macro_rules! pack { }; } -pack!(pack8, u8, 1, 8, 7); -pack!(pack16, u16, 2, 16, 15); pack!(pack32, u32, 4, 32, 31); pack!(pack64, u64, 8, 64, 63); @@ -93,18 +91,6 @@ mod tests { use super::super::unpack::*; use super::*; - #[test] - fn test_basic() { - let input = [0u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - for num_bits in 4..16 { - let mut output = [0u8; 16 * 2]; - pack16(&input, &mut output, num_bits); - let mut other = [0u16; 16]; - unpack16(&output, &mut other, num_bits); - assert_eq!(other, input); - } - } - #[test] fn test_u32() { let input = [ @@ -120,40 +106,6 @@ mod tests { } } - #[test] - fn test_u8_random() { - let mut rng = rand::thread_rng(); - let mut random_array = [0u8; 8]; - let between = Uniform::from(0..6); - for num_bits in 3..=8 { - for i in &mut random_array { - *i = between.sample(&mut rng); - } - let mut output = [0u8; 8]; - pack8(&random_array, &mut output, num_bits); - let mut other = [0u8; 8]; - unpack8(&output, &mut other, num_bits); - assert_eq!(other, random_array); - } - } - - #[test] - fn test_u16_random() { - let mut rng = rand::thread_rng(); - let mut random_array = [0u16; 16]; - let between = Uniform::from(0..128); - for num_bits in 7..=16 { - for i in &mut random_array { - *i = between.sample(&mut rng); - } - let mut output = [0u8; 16 * 2]; - pack16(&random_array, &mut output, num_bits); - let mut other = [0u16; 16]; - unpack16(&output, &mut other, num_bits); - assert_eq!(other, random_array); - } - } - #[test] fn test_u32_random() { let mut rng = rand::thread_rng(); diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs index ab6dbc00d5a5..44bbedee7a31 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs @@ -14,9 +14,27 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - +// // Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs +// This implements bit unpacking. For example, for `u8` and `num_bits=3`. +// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110 +// +// This file is a bit insane. It unrolls all the possible num_bits vs. combinations. These are very +// highly used functions in Parquet and therefore this that been extensively unrolled and +// optimized. Attempts have been done to introduce SIMD here, but those attempts have not paid off +// in comparison to auto-vectorization. +// +// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of +// runtime. +// +// 1. Each individual function unrolled to a point where all constants are known to +// the compiler. In microbenchmarks, this increases the performance by around 4.5 +// to 5 times. +// 2. All functions are compiled seperately and dispatch is done using a +// jumptable. In microbenchmarks, this increases the performance by around 2 to 2.5 +// times. + /// Macro that generates an unpack function taking the number of bits as a const generic macro_rules! unpack_impl { ($t:ty, $bytes:literal, $bits:tt) => { @@ -50,7 +68,7 @@ macro_rules! unpack_impl { // performance in a microbenchmark. Although the code it generates is completely // insane. There should be something we can do here to make this less code, sane code // and faster code. - seq_macro::seq!(i in 0..$bits { + seq_macro!(i in 0..$bits { let start_bit = i * NUM_BITS; let end_bit = start_bit + NUM_BITS; @@ -88,7 +106,7 @@ macro_rules! unpack { // @NOTE // This jumptable appoach saves around 2 - 2.5x on performance over no jumptable and no // generics. - seq_macro::seq!(i in 0..=$bits { + seq_macro!(i in 0..=$bits { if i == num_bits { return $name::unpack::(input, output); } @@ -98,8 +116,6 @@ macro_rules! unpack { }; } -unpack!(unpack8, u8, 1, 8); -unpack!(unpack16, u16, 2, 16); unpack!(unpack32, u32, 4, 32); unpack!(unpack64, u64, 8, 64); @@ -111,22 +127,6 @@ mod tests { fn test_basic() { let input = [0xFF; 4096]; - for i in 0..=8 { - let mut output = [0; 8]; - unpack8(&input, &mut output, i); - for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); - } - } - - for i in 0..=16 { - let mut output = [0; 16]; - unpack16(&input, &mut output, i); - for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); - } - } - for i in 0..=32 { let mut output = [0; 32]; unpack32(&input, &mut output, i);