Skip to content

Commit

Permalink
refactor(rust): remove seqmacro and u8,u16 bitpack
Browse files Browse the repository at this point in the history
After looking for a long time at how to SIMD this bitpacking, I am pausing for a bit to work on something else. There are some code deletions, namely removing the unused u8 and u16 implementations of bit(un)packing. This also removes the `seq_macro` crate.

Maybe, I will look at this again in the future.
  • Loading branch information
coastalwhite committed Jun 29, 2024
1 parent 8b72169 commit e1eb46d
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 109 deletions.
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion crates/polars-parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ polars-utils = { workspace = true }
simdutf8 = { workspace = true }

parquet-format-safe = "0.2"
seq-macro = { version = "0.3", default-features = false }
streaming-decompression = "0.1"

async-stream = { version = "0.3.3", optional = true }
Expand Down
84 changes: 54 additions & 30 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,57 @@
/// Minimal in-tree replacement for the `seq_macro` crate: repeats `$body`
/// once for every value in the given list, binding `$idx` as a `const usize`
/// each time so the compiler sees a known constant per repetition.
///
/// Only the ranges actually used by the bitpacking code are supported; each
/// range arm expands to the explicit list of values it covers and delegates
/// to the list arm below.
macro_rules! seq_macro {
    // Base case: emit one block per listed value, with `$idx` bound to it.
    ($idx:ident in [$($val:literal),+ $(,)?] $body:block) => {
        $({
            #[allow(non_upper_case_globals)]
            const $idx: usize = $val;
            { $body }
        })+
    };
    ($idx:ident in 1..31 $body:block) => {
        seq_macro!($idx in [
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30,
        ] $body)
    };
    ($idx:ident in 0..32 $body:block) => {
        seq_macro!($idx in [
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        ] $body)
    };
    ($idx:ident in 0..=32 $body:block) => {
        seq_macro!($idx in [
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
            32,
        ] $body)
    };
    ($idx:ident in 1..63 $body:block) => {
        seq_macro!($idx in [
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
            33, 34, 35, 36, 37, 38, 39, 40,
            41, 42, 43, 44, 45, 46, 47, 48,
            49, 50, 51, 52, 53, 54, 55, 56,
            57, 58, 59, 60, 61, 62,
        ] $body)
    };
    ($idx:ident in 0..64 $body:block) => {
        seq_macro!($idx in [
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        ] $body)
    };
    ($idx:ident in 0..=64 $body:block) => {
        seq_macro!($idx in [
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
            64,
        ] $body)
    };
}

mod decode;
mod encode;
mod pack;
Expand Down Expand Up @@ -105,36 +159,6 @@ pub trait Unpackable: Copy + Sized + Default {
fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]);
}

impl Unpackable for u8 {
    type Packed = [u8; 8];
    type Unpacked = [u8; 8];

    /// Unpack 8 values of `num_bits` bits each from `packed` into `unpacked`,
    /// delegating to the unrolled `unpack8`.
    #[inline]
    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
        unpack::unpack8(packed, unpacked, num_bits)
    }

    // NOTE: parameter names now match the trait declaration
    // (`fn pack(unpacked, num_bits, packed)`); the previous impl had them
    // swapped, naming the values array `packed` and the byte buffer
    // `unpacked`, which was misleading. Behavior is unchanged.
    /// Pack the 8 values in `unpacked` into `packed` at `num_bits` bits each,
    /// delegating to the unrolled `pack8`.
    #[inline]
    fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]) {
        pack::pack8(unpacked, packed, num_bits)
    }
}

impl Unpackable for u16 {
    type Packed = [u8; 16 * 2];
    type Unpacked = [u16; 16];

    /// Unpack 16 values of `num_bits` bits each from `packed` into `unpacked`,
    /// delegating to the unrolled `unpack16`.
    #[inline]
    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
        unpack::unpack16(packed, unpacked, num_bits)
    }

    // NOTE: parameter names now match the trait declaration
    // (`fn pack(unpacked, num_bits, packed)`); the previous impl had them
    // swapped, naming the values array `packed` and the byte buffer
    // `unpacked`, which was misleading. Behavior is unchanged.
    /// Pack the 16 values in `unpacked` into `packed` at `num_bits` bits each,
    /// delegating to the unrolled `pack16`.
    #[inline]
    fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]) {
        pack::pack16(unpacked, packed, num_bits)
    }
}

impl Unpackable for u32 {
type Packed = [u8; 32 * 4];
type Unpacked = [u32; 32];
Expand Down
52 changes: 2 additions & 50 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ macro_rules! pack_impl {

// Using microbenchmark (79d1fff), unrolling this loop is over 10x
// faster than not (>20x faster than old algorithm)
seq_macro::seq!(i in 1..$bits_minus_one {
seq_macro!(i in 1..$bits_minus_one {
let bits_filled: usize = i * NUM_BITS;
let inner_cursor: usize = bits_filled % $bits;
let remaining: usize = $bits - inner_cursor;
Expand Down Expand Up @@ -69,7 +69,7 @@ macro_rules! pack {
/// Pack unpacked `input` into `output` with a bit width of `num_bits`
pub fn $name(input: &[$t; $bits], output: &mut [u8], num_bits: usize) {
// This will get optimised into a jump table
seq_macro::seq!(i in 0..=$bits {
seq_macro!(i in 0..=$bits {
if i == num_bits {
unsafe {
return $name::pack::<i>(input, output);
Expand All @@ -81,8 +81,6 @@ macro_rules! pack {
};
}

pack!(pack8, u8, 1, 8, 7);
pack!(pack16, u16, 2, 16, 15);
pack!(pack32, u32, 4, 32, 31);
pack!(pack64, u64, 8, 64, 63);

Expand All @@ -93,18 +91,6 @@ mod tests {
use super::super::unpack::*;
use super::*;

#[test]
fn test_basic() {
    // Round-trip: packing then unpacking must reproduce the original values
    // for every supported bit width that can represent 0..=15.
    let values: [u16; 16] = std::array::from_fn(|i| i as u16);
    for width in 4..16 {
        let mut packed = [0u8; 16 * 2];
        pack16(&values, &mut packed, width);
        let mut roundtrip = [0u16; 16];
        unpack16(&packed, &mut roundtrip, width);
        assert_eq!(roundtrip, values);
    }
}

#[test]
fn test_u32() {
let input = [
Expand All @@ -120,40 +106,6 @@ mod tests {
}
}

#[test]
fn test_u8_random() {
    // Round-trip randomly generated u8 values through pack8/unpack8 for every
    // bit width wide enough to hold the sampled range (0..6 needs 3 bits).
    let mut rng = rand::thread_rng();
    let dist = Uniform::from(0..6);
    for num_bits in 3..=8 {
        let mut values = [0u8; 8];
        for slot in values.iter_mut() {
            *slot = dist.sample(&mut rng);
        }
        let mut packed = [0u8; 8];
        pack8(&values, &mut packed, num_bits);
        let mut roundtrip = [0u8; 8];
        unpack8(&packed, &mut roundtrip, num_bits);
        assert_eq!(roundtrip, values);
    }
}

#[test]
fn test_u16_random() {
    // Round-trip randomly generated u16 values through pack16/unpack16 for
    // every bit width wide enough to hold the sampled range (0..128 needs 7).
    let mut rng = rand::thread_rng();
    let dist = Uniform::from(0..128);
    for num_bits in 7..=16 {
        let mut values = [0u16; 16];
        for slot in values.iter_mut() {
            *slot = dist.sample(&mut rng);
        }
        let mut packed = [0u8; 16 * 2];
        pack16(&values, &mut packed, num_bits);
        let mut roundtrip = [0u16; 16];
        unpack16(&packed, &mut roundtrip, num_bits);
        assert_eq!(roundtrip, values);
    }
}

#[test]
fn test_u32_random() {
let mut rng = rand::thread_rng();
Expand Down
42 changes: 21 additions & 21 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,27 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//
// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs

// This implements bit unpacking. For example, for `u8` and `num_bits=3`.
// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110
//
// This file is a bit insane. It unrolls all the possible num_bits vs. combinations. These are very
// highly used functions in Parquet and therefore this has been extensively unrolled and
// optimized. Attempts have been done to introduce SIMD here, but those attempts have not paid off
// in comparison to auto-vectorization.
//
// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
// runtime.
//
// 1. Each individual function unrolled to a point where all constants are known to
// the compiler. In microbenchmarks, this increases the performance by around 4.5
// to 5 times.
// 2. All functions are compiled separately and dispatch is done using a

Check warning on line 34 in crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs

View workflow job for this annotation

GitHub Actions / main

"seperately" should be "separately".
// jumptable. In microbenchmarks, this increases the performance by around 2 to 2.5
// times.

/// Macro that generates an unpack function taking the number of bits as a const generic
macro_rules! unpack_impl {
($t:ty, $bytes:literal, $bits:tt) => {
Expand Down Expand Up @@ -50,7 +68,7 @@ macro_rules! unpack_impl {
// performance in a microbenchmark. Although the code it generates is completely
// insane. There should be something we can do here to make this less code, sane code
// and faster code.
seq_macro::seq!(i in 0..$bits {
seq_macro!(i in 0..$bits {
let start_bit = i * NUM_BITS;
let end_bit = start_bit + NUM_BITS;

Expand Down Expand Up @@ -88,7 +106,7 @@ macro_rules! unpack {
// @NOTE
// This jumptable approach saves around 2 - 2.5x on performance over no jumptable and no
// generics.
seq_macro::seq!(i in 0..=$bits {
seq_macro!(i in 0..=$bits {
if i == num_bits {
return $name::unpack::<i>(input, output);
}
Expand All @@ -98,8 +116,6 @@ macro_rules! unpack {
};
}

unpack!(unpack8, u8, 1, 8);
unpack!(unpack16, u16, 2, 16);
unpack!(unpack32, u32, 4, 32);
unpack!(unpack64, u64, 8, 64);

Expand All @@ -111,22 +127,6 @@ mod tests {
fn test_basic() {
let input = [0xFF; 4096];

for i in 0..=8 {
let mut output = [0; 8];
unpack8(&input, &mut output, i);
for (idx, out) in output.iter().enumerate() {
assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
}
}

for i in 0..=16 {
let mut output = [0; 16];
unpack16(&input, &mut output, i);
for (idx, out) in output.iter().enumerate() {
assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
}
}

for i in 0..=32 {
let mut output = [0; 32];
unpack32(&input, &mut output, i);
Expand Down

0 comments on commit e1eb46d

Please sign in to comment.