Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix fused delta #31

Merged
merged 5 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 11 additions & 25 deletions benches/bitpacking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@ use std::mem::size_of;

use arrayref::{array_mut_ref, array_ref};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use fastlanes::{bitpack, BitPacking, FastLanes};
use fastlanes::BitPacking;

fn bitpacking(c: &mut Criterion) {
fn pack(c: &mut Criterion) {
{
let mut group = c.benchmark_group("bit-packing");
let mut group = c.benchmark_group("pack");
group.bench_function("pack 16 -> 3 heap", |b| {
const WIDTH: usize = 3;
let values = vec![3u16; 1024];
let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];

b.iter(|| {
BitPacking::bitpack::<WIDTH>(
BitPacking::pack::<WIDTH>(
array_ref![values, 0, 1024],
array_mut_ref![packed, 0, 192],
);
Expand All @@ -27,34 +27,20 @@ fn bitpacking(c: &mut Criterion) {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
b.iter(|| BitPacking::bitpack::<WIDTH>(&values, &mut packed));
});

group.bench_function("pack 16 -> 3 alternate", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
b.iter(|| {
for lane in 0..u16::LANES {
// Always loop over lanes first. This is what the compiler vectorizes.
bitpack!(u16, WIDTH, packed, lane, |$pos| {
values[$pos]
});
}
});
b.iter(|| BitPacking::pack::<WIDTH>(&values, &mut packed));
});
}

{
let mut group = c.benchmark_group("bit-unpacking");
let mut group = c.benchmark_group("unpack");
group.bench_function("unpack 16 <- 3 stack", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
BitPacking::bitpack::<WIDTH>(&values, &mut packed);
BitPacking::pack::<WIDTH>(&values, &mut packed);

let mut unpacked = [0u16; 1024];
b.iter(|| BitPacking::bitunpack::<WIDTH>(&packed, &mut unpacked));
b.iter(|| BitPacking::unpack::<WIDTH>(&packed, &mut unpacked));
});
}

Expand All @@ -64,16 +50,16 @@ fn bitpacking(c: &mut Criterion) {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
BitPacking::bitpack::<WIDTH>(&values, &mut packed);
BitPacking::pack::<WIDTH>(&values, &mut packed);

b.iter(|| {
for i in 0..1024 {
black_box::<u16>(BitPacking::bitunpack_single::<WIDTH>(&packed, i));
black_box::<u16>(BitPacking::unpack_single::<WIDTH>(&packed, i));
}
});
});
}
}

criterion_group!(benches, bitpacking);
criterion_group!(benches, pack);
criterion_main!(benches);
20 changes: 12 additions & 8 deletions benches/delta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,25 @@ fn delta(c: &mut Criterion) {
let mut transposed = [0; 1024];
Transpose::transpose(&values, &mut transposed);

let mut deltas = [0; 1024];
Delta::delta(&transposed, &[0; 64], &mut deltas);

let mut packed = [0; 128 * W / size_of::<u16>()];
BitPacking::pack::<W>(&deltas, &mut packed);

group.bench_function("delta u16 fused", |b| {
b.iter(|| {
let mut packed = [0; 128 * W / size_of::<u16>()];
Delta::delta::<W>(&transposed, &[0; 64], &mut packed)
let mut unpacked = [0; 1024];
Delta::undelta_pack::<W>(&packed, &[0; 64], &mut unpacked)
});
});

group.bench_function("delta u16 unfused", |b| {
b.iter(|| {
let mut delta = [0; 1024];
// Using width == 16 does not perform any bit-packing
Delta::delta::<16>(&transposed, &[0; 64], &mut delta);

let mut packed = [0; 128 * W / size_of::<u16>()];
BitPacking::bitpack::<W>(&delta, &mut packed);
let mut unpacked = [0; 1024];
BitPacking::unpack::<W>(&packed, &mut unpacked);
let mut undelta = [0; 1024];
Delta::undelta(&unpacked, &[0; 64], &mut undelta);
});
});
}
Expand Down
70 changes: 35 additions & 35 deletions src/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{bitpack, bitunpack, seq_t, FastLanes, Pred, Satisfied};
use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied};
use arrayref::{array_mut_ref, array_ref};
use core::mem::size_of;
use num_traits::One;
Expand All @@ -15,63 +15,63 @@ impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W> where
pub trait BitPacking: FastLanes {
/// Packs 1024 elements into W bits each.
/// The output is given as Self to ensure correct alignment.
fn bitpack<const W: usize>(input: &[Self; 1024], output: &mut [Self; 1024 * W / Self::T])
fn pack<const W: usize>(input: &[Self; 1024], output: &mut [Self; 1024 * W / Self::T])
where
BitPackWidth<W>: SupportedBitPackWidth<Self>;

unsafe fn unchecked_bitpack(width: usize, input: &[Self], output: &mut [Self]);
unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]);

/// Unpacks W-bit elements into 1024 elements.
fn bitunpack<const W: usize>(input: &[Self; 1024 * W / Self::T], output: &mut [Self; 1024])
fn unpack<const W: usize>(input: &[Self; 1024 * W / Self::T], output: &mut [Self; 1024])
where
BitPackWidth<W>: SupportedBitPackWidth<Self>;

unsafe fn unchecked_bitunpack(width: usize, input: &[Self], output: &mut [Self]);
unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]);

fn bitunpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
where
BitPackWidth<W>: SupportedBitPackWidth<Self>,
Self: One,
{
// TODO(ngates): implement this function to not unpack the world.
let mut output = [Self::zero(); 1024];
Self::bitunpack::<W>(packed, &mut output);
Self::unpack::<W>(packed, &mut output);
output[index]
}

unsafe fn unchecked_bitunpack_single(width: usize, input: &[Self], index: usize) -> Self;
unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self;
}

macro_rules! impl_bitpacking {
macro_rules! impl_packing {
($T:ty) => {
paste! {
impl BitPacking for $T {
#[inline(never)] // Makes it easier to disassemble and validate ASM.
fn bitpack<const W: usize>(
fn pack<const W: usize>(
input: &[Self; 1024],
output: &mut [Self; 1024 * W / Self::T],
) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
for lane in 0..Self::LANES {
bitpack!($T, W, output, lane, |$idx| {
pack!($T, W, output, lane, |$idx| {
input[$idx]
});
}
}

unsafe fn unchecked_bitpack(width: usize, input: &[Self], output: &mut [Self]) {
unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
let packed_len = 128 * width / size_of::<Self>();
debug_assert_eq!(output.len(), packed_len, "Output buffer must be of size 1024 * W / T");
debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);

seq_t!(W in $T {
match width {
#(W => Self::bitpack::<W>(
#(W => Self::pack::<W>(
array_ref![input, 0, 1024],
array_mut_ref![output, 0, 1024 * W / <$T>::T],
),)*
// seq_t has exclusive upper bound
Self::T => Self::bitpack::<{ Self::T }>(
Self::T => Self::pack::<{ Self::T }>(
array_ref![input, 0, 1024],
array_mut_ref![output, 0, 1024],
),
Expand All @@ -81,31 +81,31 @@ macro_rules! impl_bitpacking {
}

#[inline(never)]
fn bitunpack<const W: usize>(
fn unpack<const W: usize>(
input: &[Self; 1024 * W / Self::T],
output: &mut [Self; 1024],
) where BitPackWidth<W>: SupportedBitPackWidth<Self> {
for lane in 0..Self::LANES {
bitunpack!($T, W, input, lane, |$idx, $elem| {
unpack!($T, W, input, lane, |$idx, $elem| {
output[$idx] = $elem
});
}
}

unsafe fn unchecked_bitunpack(width: usize, input: &[Self], output: &mut [Self]) {
unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
let packed_len = 128 * width / size_of::<Self>();
debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size 1024 * W / T");
debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);

seq_t!(W in $T {
match width {
#(W => Self::bitunpack::<W>(
#(W => Self::unpack::<W>(
array_ref![input, 0, 1024 * W / <$T>::T],
array_mut_ref![output, 0, 1024],
),)*
// seq_t has exclusive upper bound
Self::T => Self::bitunpack::<{ Self::T }>(
Self::T => Self::unpack::<{ Self::T }>(
array_ref![input, 0, 1024],
array_mut_ref![output, 0, 1024],
),
Expand All @@ -114,20 +114,20 @@ macro_rules! impl_bitpacking {
})
}

unsafe fn unchecked_bitunpack_single(width: usize, input: &[Self], index: usize) -> Self {
unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
let packed_len = 128 * width / size_of::<Self>();
debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len);
debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
debug_assert!(index <= 1024, "index must be less than or equal to 1024");

seq_t!(W in $T {
match width {
#(W => Self::bitunpack_single::<W>(
#(W => Self::unpack_single::<W>(
array_ref![input, 0, 1024 * W / <$T>::T],
index,
),)*
// seq_t has exclusive upper bound
Self::T => Self::bitunpack_single::<{ Self::T }>(
Self::T => Self::unpack_single::<{ Self::T }>(
array_ref![input, 0, 1024],
index,
),
Expand All @@ -140,10 +140,10 @@ macro_rules! impl_bitpacking {
};
}

impl_bitpacking!(u8);
impl_bitpacking!(u16);
impl_bitpacking!(u32);
impl_bitpacking!(u64);
impl_packing!(u8);
impl_packing!(u16);
impl_packing!(u32);
impl_packing!(u64);

#[cfg(test)]
mod test {
Expand All @@ -154,25 +154,25 @@ mod test {
use seq_macro::seq;

#[test]
fn test_unchecked_bitpack() {
fn test_unchecked_pack() {
let input = array::from_fn(|i| i as u32);
let mut packed = [0; 320];
unsafe { BitPacking::unchecked_bitpack(10, &input, &mut packed) };
unsafe { BitPacking::unchecked_pack(10, &input, &mut packed) };
let mut output = [0; 1024];
unsafe { BitPacking::unchecked_bitunpack(10, &packed, &mut output) };
unsafe { BitPacking::unchecked_unpack(10, &packed, &mut output) };
assert_eq!(input, output);
}

#[test]
fn test_bitunpack_single() {
fn test_unpack_single() {
let values = array::from_fn(|i| i as u32);
let mut packed = [0; 512];
BitPacking::bitpack::<16>(&values, &mut packed);
BitPacking::pack::<16>(&values, &mut packed);

for i in 0..1024 {
assert_eq!(BitPacking::bitunpack_single::<16>(&packed, i), values[i]);
assert_eq!(BitPacking::unpack_single::<16>(&packed, i), values[i]);
assert_eq!(
unsafe { BitPacking::unchecked_bitunpack_single(16, &packed, i) },
unsafe { BitPacking::unchecked_unpack_single(16, &packed, i) },
values[i]
);
}
Expand All @@ -190,10 +190,10 @@ mod test {
}

let mut packed = [T::zero(); 1024 * W / T::T];
BitPacking::bitpack::<W>(&values, &mut packed);
BitPacking::pack::<W>(&values, &mut packed);

let mut unpacked = [T::zero(); 1024];
BitPacking::bitunpack::<W>(&packed, &mut unpacked);
BitPacking::unpack::<W>(&packed, &mut unpacked);

assert_eq!(&unpacked, &values);
}
Expand Down
Loading
Loading