From e1eb46d1d4a54cd117d23e818fd488cc914c0f99 Mon Sep 17 00:00:00 2001
From: coastalwhite <me@gburghoorn.com>
Date: Sat, 29 Jun 2024 16:32:35 +0200
Subject: [PATCH] refactor(rust): remove seqmacro and u8,u16 bitpack

After looking for a long time at how to SIMD this bitpacking, I am pausing for a bit to work on something else. There are some code deletions, namely removing the unused u8 and u16 implementations of bit(un)packing. This also removes the `seq_macro` crate.

Maybe, I will look at this again in the future.
---
 Cargo.lock                                    |  7 --
 crates/polars-parquet/Cargo.toml              |  1 -
 .../src/parquet/encoding/bitpacked/mod.rs     | 84 ++++++++++++-------
 .../src/parquet/encoding/bitpacked/pack.rs    | 52 +-----------
 .../src/parquet/encoding/bitpacked/unpack.rs  | 42 +++++-----
 5 files changed, 77 insertions(+), 109 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 66aad84a20d6..2cc791536a93 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3282,7 +3282,6 @@ dependencies = [
  "polars-error",
  "polars-utils",
  "rand",
- "seq-macro",
  "serde",
  "simdutf8",
  "snap",
@@ -4266,12 +4265,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "seq-macro"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
-
 [[package]]
 name = "serde"
 version = "1.0.203"
diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml
index b30d86df794b..cdf727481976 100644
--- a/crates/polars-parquet/Cargo.toml
+++ b/crates/polars-parquet/Cargo.toml
@@ -26,7 +26,6 @@ polars-utils = { workspace = true }
 simdutf8 = { workspace = true }
 
 parquet-format-safe = "0.2"
-seq-macro = { version = "0.3", default-features = false }
 streaming-decompression = "0.1"
 
 async-stream = { version = "0.3.3", optional = true }
diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
index 4d80a43ed879..ef6c5313ba26 100644
--- a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
+++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs
@@ -1,3 +1,57 @@
+macro_rules! seq_macro {
+    ($i:ident in 1..31 $block:block) => {
+        seq_macro!($i in [
+                 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+        ] $block)
+    };
+    ($i:ident in 0..32 $block:block) => {
+        seq_macro!($i in [
+             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+        ] $block)
+    };
+    ($i:ident in 0..=32 $block:block) => {
+        seq_macro!($i in [
+             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            32,
+        ] $block)
+    };
+    ($i:ident in 1..63 $block:block) => {
+        seq_macro!($i in [
+                 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+        ] $block)
+    };
+    ($i:ident in 0..64 $block:block) => {
+        seq_macro!($i in [
+             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+        ] $block)
+    };
+    ($i:ident in 0..=64 $block:block) => {
+        seq_macro!($i in [
+             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+            64,
+        ] $block)
+    };
+    ($i:ident in [$($value:literal),+ $(,)?] $block:block) => {
+        $({
+            #[allow(non_upper_case_globals)]
+            const $i: usize = $value;
+            { $block }
+        })+
+    };
+}
+
 mod decode;
 mod encode;
 mod pack;
@@ -105,36 +159,6 @@ pub trait Unpackable: Copy + Sized + Default {
     fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]);
 }
 
-impl Unpackable for u8 {
-    type Packed = [u8; 8];
-    type Unpacked = [u8; 8];
-
-    #[inline]
-    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
-        unpack::unpack8(packed, unpacked, num_bits)
-    }
-
-    #[inline]
-    fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) {
-        pack::pack8(packed, unpacked, num_bits)
-    }
-}
-
-impl Unpackable for u16 {
-    type Packed = [u8; 16 * 2];
-    type Unpacked = [u16; 16];
-
-    #[inline]
-    fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) {
-        unpack::unpack16(packed, unpacked, num_bits)
-    }
-
-    #[inline]
-    fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) {
-        pack::pack16(packed, unpacked, num_bits)
-    }
-}
-
 impl Unpackable for u32 {
     type Packed = [u8; 32 * 4];
     type Unpacked = [u32; 32];
diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs
index bb8263ac4b23..c318f42649d3 100644
--- a/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs
+++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs
@@ -23,7 +23,7 @@ macro_rules! pack_impl {
 
             // Using microbenchmark (79d1fff), unrolling this loop is over 10x
             // faster than not (>20x faster than old algorithm)
-            seq_macro::seq!(i in 1..$bits_minus_one {
+            seq_macro!(i in 1..$bits_minus_one {
                 let bits_filled: usize = i * NUM_BITS;
                 let inner_cursor: usize = bits_filled % $bits;
                 let remaining: usize = $bits - inner_cursor;
@@ -69,7 +69,7 @@ macro_rules! pack {
         /// Pack unpacked `input` into `output` with a bit width of `num_bits`
         pub fn $name(input: &[$t; $bits], output: &mut [u8], num_bits: usize) {
             // This will get optimised into a jump table
-            seq_macro::seq!(i in 0..=$bits {
+            seq_macro!(i in 0..=$bits {
                 if i == num_bits {
                     unsafe {
                         return $name::pack::<i>(input, output);
@@ -81,8 +81,6 @@ macro_rules! pack {
     };
 }
 
-pack!(pack8, u8, 1, 8, 7);
-pack!(pack16, u16, 2, 16, 15);
 pack!(pack32, u32, 4, 32, 31);
 pack!(pack64, u64, 8, 64, 63);
 
@@ -93,18 +91,6 @@ mod tests {
     use super::super::unpack::*;
     use super::*;
 
-    #[test]
-    fn test_basic() {
-        let input = [0u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
-        for num_bits in 4..16 {
-            let mut output = [0u8; 16 * 2];
-            pack16(&input, &mut output, num_bits);
-            let mut other = [0u16; 16];
-            unpack16(&output, &mut other, num_bits);
-            assert_eq!(other, input);
-        }
-    }
-
     #[test]
     fn test_u32() {
         let input = [
@@ -120,40 +106,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_u8_random() {
-        let mut rng = rand::thread_rng();
-        let mut random_array = [0u8; 8];
-        let between = Uniform::from(0..6);
-        for num_bits in 3..=8 {
-            for i in &mut random_array {
-                *i = between.sample(&mut rng);
-            }
-            let mut output = [0u8; 8];
-            pack8(&random_array, &mut output, num_bits);
-            let mut other = [0u8; 8];
-            unpack8(&output, &mut other, num_bits);
-            assert_eq!(other, random_array);
-        }
-    }
-
-    #[test]
-    fn test_u16_random() {
-        let mut rng = rand::thread_rng();
-        let mut random_array = [0u16; 16];
-        let between = Uniform::from(0..128);
-        for num_bits in 7..=16 {
-            for i in &mut random_array {
-                *i = between.sample(&mut rng);
-            }
-            let mut output = [0u8; 16 * 2];
-            pack16(&random_array, &mut output, num_bits);
-            let mut other = [0u16; 16];
-            unpack16(&output, &mut other, num_bits);
-            assert_eq!(other, random_array);
-        }
-    }
-
     #[test]
     fn test_u32_random() {
         let mut rng = rand::thread_rng();
diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
index ab6dbc00d5a5..44bbedee7a31 100644
--- a/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
+++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
@@ -14,9 +14,27 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-
+//
 // Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs
 
+// This implements bit unpacking. For example, for `u8` and `num_bits=3`.
+// 0b001_101_110  -> 0b0000_0001, 0b0000_0101, 0b0000_0110
+//
+// This file is a bit insane. It unrolls all the possible num_bits vs. combinations. These are very
+// highly used functions in Parquet and therefore this that been extensively unrolled and
+// optimized. Attempts have been done to introduce SIMD here, but those attempts have not paid off
+// in comparison to auto-vectorization.
+//
+// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
+// runtime.
+//
+// 1. Each individual function unrolled to a point where all constants are known to
+// the compiler. In microbenchmarks, this increases the performance by around 4.5
+// to 5 times.
+// 2. All functions are compiled seperately and dispatch is done using a
+// jumptable. In microbenchmarks, this increases the performance by around 2 to 2.5
+// times.
+
 /// Macro that generates an unpack function taking the number of bits as a const generic
 macro_rules! unpack_impl {
     ($t:ty, $bytes:literal, $bits:tt) => {
@@ -50,7 +68,7 @@ macro_rules! unpack_impl {
             // performance in a microbenchmark. Although the code it generates is completely
             // insane. There should be something we can do here to make this less code, sane code
             // and faster code.
-            seq_macro::seq!(i in 0..$bits {
+            seq_macro!(i in 0..$bits {
                 let start_bit = i * NUM_BITS;
                 let end_bit = start_bit + NUM_BITS;
 
@@ -88,7 +106,7 @@ macro_rules! unpack {
             // @NOTE
             // This jumptable appoach saves around 2 - 2.5x on performance over no jumptable and no
             // generics.
-            seq_macro::seq!(i in 0..=$bits {
+            seq_macro!(i in 0..=$bits {
                 if i == num_bits {
                     return $name::unpack::<i>(input, output);
                 }
@@ -98,8 +116,6 @@ macro_rules! unpack {
     };
 }
 
-unpack!(unpack8, u8, 1, 8);
-unpack!(unpack16, u16, 2, 16);
 unpack!(unpack32, u32, 4, 32);
 unpack!(unpack64, u64, 8, 64);
 
@@ -111,22 +127,6 @@ mod tests {
     fn test_basic() {
         let input = [0xFF; 4096];
 
-        for i in 0..=8 {
-            let mut output = [0; 8];
-            unpack8(&input, &mut output, i);
-            for (idx, out) in output.iter().enumerate() {
-                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
-            }
-        }
-
-        for i in 0..=16 {
-            let mut output = [0; 16];
-            unpack16(&input, &mut output, i);
-            for (idx, out) in output.iter().enumerate() {
-                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
-            }
-        }
-
         for i in 0..=32 {
             let mut output = [0; 32];
             unpack32(&input, &mut output, i);