Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(rust): remove seqmacro and u8,u16 bitpack #17290

Merged
merged 2 commits into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion crates/polars-parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ polars-utils = { workspace = true }
simdutf8 = { workspace = true }

parquet-format-safe = "0.2"
seq-macro = { version = "0.3", default-features = false }
streaming-decompression = "0.1"

async-stream = { version = "0.3.3", optional = true }
Expand Down
84 changes: 54 additions & 30 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,57 @@
// Minimal, local replacement for the external `seq-macro` crate (removed as a
// dependency by this change): expands `$block` once per integer in the given
// range, binding `$i` as a `const usize` on each expansion.
//
// `macro_rules!` cannot iterate over a numeric range itself, so each range
// shape actually used by the bitpacking code below is matched literally and
// rewritten into an explicit value list, which the final arm then expands.
macro_rules! seq_macro {
    // 1..31 (exclusive upper bound): expands for values 1..=30.
    ($i:ident in 1..31 $block:block) => {
        seq_macro!($i in [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
        ] $block)
    };
    // 0..32: expands for values 0..=31.
    ($i:ident in 0..32 $block:block) => {
        seq_macro!($i in [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        ] $block)
    };
    // 0..=32 (inclusive upper bound): expands for values 0..=32.
    ($i:ident in 0..=32 $block:block) => {
        seq_macro!($i in [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32,
        ] $block)
    };
    // 1..63: expands for values 1..=62.
    ($i:ident in 1..63 $block:block) => {
        seq_macro!($i in [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
        ] $block)
    };
    // 0..64: expands for values 0..=63.
    ($i:ident in 0..64 $block:block) => {
        seq_macro!($i in [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        ] $block)
    };
    // 0..=64: expands for values 0..=64.
    ($i:ident in 0..=64 $block:block) => {
        seq_macro!($i in [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
            64,
        ] $block)
    };
    // Base case: expand `$block` once per listed value. `$i` is bound as a
    // `const usize` (not a `let`) so the body can use it where a constant is
    // required, e.g. as a const-generic argument in the dispatch code below.
    ($i:ident in [$($value:literal),+ $(,)?] $block:block) => {
        $({
            #[allow(non_upper_case_globals)]
            const $i: usize = $value;
            { $block }
        })+
    };
}

mod decode;
mod encode;
mod pack;
Expand Down Expand Up @@ -105,36 +159,6 @@ pub trait Unpackable: Copy + Sized + Default {
fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]);
}

/// Bit-(un)packing for `u8`: values are processed in groups of 8.
impl Unpackable for u8 {
    // 8 values at up to 8 bits each pack into at most 8 bytes.
    type Packed = [u8; 8];
    type Unpacked = [u8; 8];

    /// Decode 8 `num_bits`-wide values from `packed` into `unpacked`.
    #[inline]
    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
        unpack::unpack8(packed, unpacked, num_bits)
    }

    /// Encode the 8 values in `unpacked` at `num_bits` each into `packed`.
    // Parameter names fixed to match the trait declaration: the original had
    // them swapped (`packed` was the unpacked input and vice versa), which was
    // misleading; the call itself is unchanged.
    #[inline]
    fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]) {
        pack::pack8(unpacked, packed, num_bits)
    }
}

/// Bit-(un)packing for `u16`: values are processed in groups of 16.
impl Unpackable for u16 {
    // 16 values at up to 16 bits each pack into at most 32 bytes.
    type Packed = [u8; 16 * 2];
    type Unpacked = [u16; 16];

    /// Decode 16 `num_bits`-wide values from `packed` into `unpacked`.
    #[inline]
    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
        unpack::unpack16(packed, unpacked, num_bits)
    }

    /// Encode the 16 values in `unpacked` at `num_bits` each into `packed`.
    // Parameter names fixed to match the trait declaration: the original had
    // them swapped (`packed` was the unpacked input and vice versa), which was
    // misleading; the call itself is unchanged.
    #[inline]
    fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]) {
        pack::pack16(unpacked, packed, num_bits)
    }
}

impl Unpackable for u32 {
type Packed = [u8; 32 * 4];
type Unpacked = [u32; 32];
Expand Down
52 changes: 2 additions & 50 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ macro_rules! pack_impl {

// Using microbenchmark (79d1fff), unrolling this loop is over 10x
// faster than not (>20x faster than old algorithm)
seq_macro::seq!(i in 1..$bits_minus_one {
seq_macro!(i in 1..$bits_minus_one {
let bits_filled: usize = i * NUM_BITS;
let inner_cursor: usize = bits_filled % $bits;
let remaining: usize = $bits - inner_cursor;
Expand Down Expand Up @@ -69,7 +69,7 @@ macro_rules! pack {
/// Pack unpacked `input` into `output` with a bit width of `num_bits`
pub fn $name(input: &[$t; $bits], output: &mut [u8], num_bits: usize) {
// This will get optimised into a jump table
seq_macro::seq!(i in 0..=$bits {
seq_macro!(i in 0..=$bits {
if i == num_bits {
unsafe {
return $name::pack::<i>(input, output);
Expand All @@ -81,8 +81,6 @@ macro_rules! pack {
};
}

pack!(pack8, u8, 1, 8, 7);
pack!(pack16, u16, 2, 16, 15);
pack!(pack32, u32, 4, 32, 31);
pack!(pack64, u64, 8, 64, 63);

Expand All @@ -93,18 +91,6 @@ mod tests {
use super::super::unpack::*;
use super::*;

#[test]
fn test_basic() {
    // Round-trip check: packing followed by unpacking must be the identity
    // for every width wide enough to hold the values (max value 15 → 4 bits).
    let original: [u16; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    for width in 4..16 {
        let mut packed = [0u8; 32];
        pack16(&original, &mut packed, width);
        let mut roundtrip = [0u16; 16];
        unpack16(&packed, &mut roundtrip, width);
        assert_eq!(roundtrip, original);
    }
}

#[test]
fn test_u32() {
let input = [
Expand All @@ -120,40 +106,6 @@ mod tests {
}
}

#[test]
fn test_u8_random() {
    // Random round-trip for u8: draws are in 0..6 (fits in 3 bits), so every
    // width from 3 through 8 must reproduce the input exactly.
    let mut rng = rand::thread_rng();
    let dist = Uniform::from(0..6);
    let mut values = [0u8; 8];
    for width in 3..=8 {
        for slot in values.iter_mut() {
            *slot = dist.sample(&mut rng);
        }
        let mut packed = [0u8; 8];
        pack8(&values, &mut packed, width);
        let mut roundtrip = [0u8; 8];
        unpack8(&packed, &mut roundtrip, width);
        assert_eq!(roundtrip, values);
    }
}

#[test]
fn test_u16_random() {
    // Random round-trip for u16: draws are in 0..128 (fits in 7 bits), so
    // every width from 7 through 16 must reproduce the input exactly.
    let mut rng = rand::thread_rng();
    let dist = Uniform::from(0..128);
    let mut values = [0u16; 16];
    for width in 7..=16 {
        for slot in values.iter_mut() {
            *slot = dist.sample(&mut rng);
        }
        let mut packed = [0u8; 32];
        pack16(&values, &mut packed, width);
        let mut roundtrip = [0u16; 16];
        unpack16(&packed, &mut roundtrip, width);
        assert_eq!(roundtrip, values);
    }
}

#[test]
fn test_u32_random() {
let mut rng = rand::thread_rng();
Expand Down
42 changes: 21 additions & 21 deletions crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,27 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//
// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs

// This implements bit unpacking. For example, for `u8` and `num_bits=3`.
// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110
//
// This file is a bit insane. It unrolls all the possible num_bits vs. type combinations. These
// are very heavily used functions in Parquet and therefore they have been extensively unrolled
// and optimized. Attempts have been made to introduce SIMD here, but those attempts have not
// paid off in comparison to auto-vectorization.
//
// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
// runtime.
//
// 1. Each individual function is unrolled to a point where all constants are known to
// the compiler. In microbenchmarks, this increases the performance by around 4.5
// to 5 times.
// 2. All functions are compiled separately and dispatch is done using a

Check warning on line 34 in crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs

View workflow job for this annotation

GitHub Actions / main

"seperately" should be "separately".
ritchie46 marked this conversation as resolved.
Show resolved Hide resolved
// jumptable. In microbenchmarks, this increases the performance by around 2 to 2.5
// times.

/// Macro that generates an unpack function taking the number of bits as a const generic
macro_rules! unpack_impl {
($t:ty, $bytes:literal, $bits:tt) => {
Expand Down Expand Up @@ -50,7 +68,7 @@
// performance in a microbenchmark. Although the code it generates is completely
// insane. There should be something we can do here to make this less code, sane code
// and faster code.
seq_macro::seq!(i in 0..$bits {
seq_macro!(i in 0..$bits {
let start_bit = i * NUM_BITS;
let end_bit = start_bit + NUM_BITS;

Expand Down Expand Up @@ -88,7 +106,7 @@
// @NOTE
// This jumptable approach saves around 2 - 2.5x on performance over no jumptable and no
// generics.
seq_macro::seq!(i in 0..=$bits {
seq_macro!(i in 0..=$bits {
if i == num_bits {
return $name::unpack::<i>(input, output);
}
Expand All @@ -98,8 +116,6 @@
};
}

unpack!(unpack8, u8, 1, 8);
unpack!(unpack16, u16, 2, 16);
unpack!(unpack32, u32, 4, 32);
unpack!(unpack64, u64, 8, 64);

Expand All @@ -111,22 +127,6 @@
fn test_basic() {
let input = [0xFF; 4096];

for i in 0..=8 {
let mut output = [0; 8];
unpack8(&input, &mut output, i);
for (idx, out) in output.iter().enumerate() {
assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
}
}

for i in 0..=16 {
let mut output = [0; 16];
unpack16(&input, &mut output, i);
for (idx, out) in output.iter().enumerate() {
assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
}
}

for i in 0..=32 {
let mut output = [0; 32];
unpack32(&input, &mut output, i);
Expand Down
Loading