Implement SIMD comparison operations for types with less than 4 lanes (i128) #1146

Merged 6 commits on Jan 10, 2022
14 changes: 13 additions & 1 deletion arrow/benches/comparison_kernels.rs
@@ -22,7 +22,7 @@ use criterion::Criterion;
extern crate arrow;

use arrow::compute::*;
use arrow::datatypes::ArrowNumericType;
use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
use arrow::util::bench_util::*;
use arrow::{array::*, datatypes::Float32Type};

@@ -138,6 +138,11 @@ fn add_benchmark(c: &mut Criterion) {
let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
let arr_b = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 43);

let arr_month_day_nano_a =
create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 0.0, 43);
let arr_month_day_nano_b =
create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 0.0, 43);

let arr_string = create_string_array::<i32>(size, 0.0);

c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b)));
@@ -170,6 +175,13 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0))
});

c.bench_function("eq MonthDayNano", |b| {
b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b))
});
c.bench_function("eq scalar MonthDayNano", |b| {
b.iter(|| bench_eq_scalar(&arr_month_day_nano_a, 123))
});

c.bench_function("like_utf8 scalar equals", |b| {
b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx"))
});
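The `bench_eq` and `bench_eq_scalar` helpers used above are defined earlier in `comparison_kernels.rs` and are not part of this hunk. As a rough sketch of what each benchmark iteration measures (the exact bodies may differ slightly), they wrap the `eq` / `eq_scalar` kernels behind `criterion::black_box`:

```rust
use arrow::array::PrimitiveArray;
use arrow::compute::*;
use arrow::datatypes::ArrowNumericType;

// Sketch only: approximate shape of the helpers referenced by the benchmarks.
fn bench_eq<T>(arr_a: &PrimitiveArray<T>, arr_b: &PrimitiveArray<T>)
where
    T: ArrowNumericType,
{
    // black_box keeps the compiler from optimizing the kernel call away
    eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap();
}

fn bench_eq_scalar<T>(arr_a: &PrimitiveArray<T>, value_b: T::Native)
where
    T: ArrowNumericType,
{
    eq_scalar(criterion::black_box(arr_a), value_b).unwrap();
}
```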
122 changes: 70 additions & 52 deletions arrow/src/compute/kernels/comparison.rs
@@ -1471,53 +1471,65 @@ where

let null_bit_buffer = combine_option_bitmap(left.data_ref(), right.data_ref(), len)?;

// we process the data in chunks so that each iteration results in one u64 of comparison result bits
const CHUNK_SIZE: usize = 64;
let lanes = T::lanes();

// this is currently the case for all our datatypes and allows us to always append full bytes
assert!(
lanes <= CHUNK_SIZE,
"Number of vector lanes must be at most 64"
);

let buffer_size = bit_util::ceil(len, 8);
let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);

// this is currently the case for all our datatypes and allows us to always append full bytes
assert_eq!(lanes % 8, 0, "Number of vector lanes must be multiple of 8");
let mut left_chunks = left.values().chunks_exact(lanes);
let mut right_chunks = right.values().chunks_exact(lanes);
let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE);

// safety: result is newly created above, always written as a T below
let result_chunks = unsafe { result.typed_data_mut() };
let result_remainder = left_chunks
.borrow_mut()
.zip(right_chunks.borrow_mut())
.fold(result_chunks, |result_slice, (left_slice, right_slice)| {
let simd_left = T::load(left_slice);
let simd_right = T::load(right_slice);
let simd_result = simd_op(simd_left, simd_right);
let mut i = 0;
let mut bitmask = 0_u64;
while i < CHUNK_SIZE {
let simd_left = T::load(&left_slice[i..]);
let simd_right = T::load(&right_slice[i..]);
let simd_result = simd_op(simd_left, simd_right);

let bitmask = T::mask_to_u64(&simd_result);
let m = T::mask_to_u64(&simd_result);
bitmask |= m << i;

i += lanes;
}
let bytes = bitmask.to_le_bytes();
result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
result_slice[0..8].copy_from_slice(&bytes);

&mut result_slice[lanes / 8..]
&mut result_slice[8..]
});

let left_remainder = left_chunks.remainder();
let right_remainder = right_chunks.remainder();

assert_eq!(left_remainder.len(), right_remainder.len());

let remainder_bitmask = left_remainder
.iter()
.zip(right_remainder.iter())
.enumerate()
.fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
let bit = if scalar_op(*scalar_left, *scalar_right) {
1_u64
} else {
0_u64
};
mask |= bit << i;
mask
});
let remainder_mask_as_bytes =
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
result_remainder.copy_from_slice(remainder_mask_as_bytes);
if !left_remainder.is_empty() {
let remainder_bitmask = left_remainder
.iter()
.zip(right_remainder.iter())
.enumerate()
.fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
let bit = scalar_op(*scalar_left, *scalar_right) as u64;
mask |= bit << i;
mask
});
let remainder_mask_as_bytes =
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
result_remainder.copy_from_slice(remainder_mask_as_bytes);
}

let data = unsafe {
ArrayData::new_unchecked(
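The core of the change in `simd_compare_op`: instead of requiring `lanes % 8 == 0` and writing `lanes / 8` bytes per SIMD comparison, the kernel now walks fixed 64-element chunks, runs the SIMD comparison `64 / lanes` times per chunk, and ORs each `lanes`-bit mask into a single `u64` at the element offset, so a full 8 bytes can always be appended. This is what makes 4-lane types such as `i128x4` (IntervalMonthDayNano) work. Below is a minimal standalone sketch of the same packing, with a plain scalar comparison standing in for the SIMD op; the name `pack_eq_bits` is illustrative and not part of the kernel:

```rust
/// Sketch of the chunked bit-packing used above; `lanes` models T::lanes()
/// and the inner `for` loop stands in for one SIMD compare + mask_to_u64.
fn pack_eq_bits(left: &[i128], right: &[i128], lanes: usize) -> Vec<u8> {
    const CHUNK_SIZE: usize = 64;
    assert!(lanes <= CHUNK_SIZE && CHUNK_SIZE % lanes == 0);

    let mut out = Vec::new();
    for (l, r) in left
        .chunks_exact(CHUNK_SIZE)
        .zip(right.chunks_exact(CHUNK_SIZE))
    {
        let mut bitmask = 0_u64;
        let mut i = 0;
        while i < CHUNK_SIZE {
            // one "SIMD" step: a lanes-bit mask, least significant bit = first element
            let mut m = 0_u64;
            for j in 0..lanes {
                m |= ((l[i + j] == r[i + j]) as u64) << j;
            }
            // place the lane mask at its element offset inside the 64-bit chunk
            bitmask |= m << i;
            i += lanes;
        }
        // each 64-element chunk contributes exactly 8 result bytes
        out.extend_from_slice(&bitmask.to_le_bytes());
    }
    // the remainder (len % 64 elements) is handled separately, as in the kernel
    out
}
```

With `lanes = 4` (i128x4) the inner loop runs 16 times and the shifts 0, 4, 8, ..., 60 fill the `u64`; with `lanes = 16` (f32x16) it runs 4 times with shifts 0, 16, 32, 48.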
@@ -1551,16 +1563,20 @@ where

let len = left.len();

// we process the data in chunks so that each iteration results in one u64 of comparison result bits
const CHUNK_SIZE: usize = 64;
let lanes = T::lanes();
let buffer_size = bit_util::ceil(len, 8);
let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);

// this is currently the case for all our datatypes and allows us to always append full bytes
assert!(
lanes % 8 == 0,
"Number of vector lanes must be multiple of 8"
lanes <= CHUNK_SIZE,
"Number of vector lanes must be at most 64"
);
let mut left_chunks = left.values().chunks_exact(lanes);

let buffer_size = bit_util::ceil(len, 8);
let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);

let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
let simd_right = T::init(right);

// safety: result is newly created above, always written as a T below
@@ -1569,34 +1585,38 @@ where
left_chunks
.borrow_mut()
.fold(result_chunks, |result_slice, left_slice| {
let simd_left = T::load(left_slice);
let simd_result = simd_op(simd_left, simd_right);
let mut i = 0;
let mut bitmask = 0_u64;
while i < CHUNK_SIZE {
let simd_left = T::load(&left_slice[i..]);
let simd_result = simd_op(simd_left, simd_right);

let m = T::mask_to_u64(&simd_result);
bitmask |= m << i;

let bitmask = T::mask_to_u64(&simd_result);
i += lanes;
}
let bytes = bitmask.to_le_bytes();
result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
result_slice[0..8].copy_from_slice(&bytes);

&mut result_slice[lanes / 8..]
&mut result_slice[8..]
});

let left_remainder = left_chunks.remainder();

let remainder_bitmask =
left_remainder
.iter()
.enumerate()
.fold(0_u64, |mut mask, (i, scalar_left)| {
let bit = if scalar_op(*scalar_left, right) {
1_u64
} else {
0_u64
};
if !left_remainder.is_empty() {
let remainder_bitmask = left_remainder.iter().enumerate().fold(
0_u64,
|mut mask, (i, scalar_left)| {
let bit = scalar_op(*scalar_left, right) as u64;
mask |= bit << i;
mask
});
let remainder_mask_as_bytes =
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
result_remainder.copy_from_slice(remainder_mask_as_bytes);
},
);
let remainder_mask_as_bytes =
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
result_remainder.copy_from_slice(remainder_mask_as_bytes);
}

let null_bit_buffer = left
.data_ref()
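After the 64-element chunks are exhausted, the last `len % 64` values fall back to the plain `scalar_op`, their result bits are folded into a `u64`, and only `ceil(remainder_len, 8)` bytes of it are copied out; the new `is_empty` guard simply skips this step when the length is an exact multiple of 64. A sketch of just that tail step for the scalar-comparison variant, with illustrative names rather than the kernel's API:

```rust
/// Sketch of the scalar remainder path: compares each leftover value
/// against `right` and packs one result bit per value.
fn pack_remainder_eq_bits(left_remainder: &[i128], right: i128, result_remainder: &mut [u8]) {
    // the remainder is always shorter than one 64-element chunk
    debug_assert!(left_remainder.len() < 64);

    let remainder_bitmask = left_remainder
        .iter()
        .enumerate()
        .fold(0_u64, |mut mask, (i, scalar_left)| {
            mask |= ((*scalar_left == right) as u64) << i;
            mask
        });

    // ceil(len, 8) bytes are enough to hold one bit per remaining element
    let n_bytes = (left_remainder.len() + 7) / 8;
    result_remainder[..n_bytes]
        .copy_from_slice(&remainder_bitmask.to_le_bytes()[..n_bytes]);
}
```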
@@ -2723,8 +2743,6 @@ mod tests {
);
}

// Fails when simd is enabled: https://github.com/apache/arrow-rs/issues/1136
Contributor: 👍

#[cfg(not(feature = "simd"))]
#[test]
fn test_interval_array() {
let a = IntervalDayTimeArray::from(
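The `#[cfg(not(feature = "simd"))]` escape hatch is removed here: it was only needed because interval comparisons previously failed when the `simd` feature was enabled (issue #1136), and with the new code path the test can run either way. A hypothetical end-to-end check of what this enables, assuming the usual `eq` kernel and array constructors:

```rust
use arrow::array::{BooleanArray, IntervalMonthDayNanoArray};
use arrow::compute::eq;

fn main() {
    // i128-backed interval arrays now take the SIMD path (i128x4) when the
    // `simd` feature is enabled, and the scalar path otherwise.
    let a = IntervalMonthDayNanoArray::from(vec![Some(1), None, Some(3)]);
    let b = IntervalMonthDayNanoArray::from(vec![Some(1), None, Some(4)]);

    let result = eq(&a, &b).unwrap();
    assert_eq!(result, BooleanArray::from(vec![Some(true), None, Some(false)]));
}
```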
29 changes: 27 additions & 2 deletions arrow/src/datatypes/numeric.rs
@@ -147,6 +147,20 @@ macro_rules! make_numeric_type {
// this match will get removed by the compiler since the number of lanes is known at
// compile-time for each concrete numeric type
match Self::lanes() {
4 => {
// the bit position in each lane indicates the index of that lane
let vecidx = i128x4::new(1, 2, 4, 8);

// broadcast the lowermost 4 bits of mask to each lane
let vecmask = i128x4::splat((mask & 0x0F) as i128);
// compute whether the bit corresponding to each lane's index is set
let vecmask = (vecidx & vecmask).eq(vecidx);

// transmute is necessary because the different match arms return different
// mask types, at runtime only one of those expressions will exist per type,
// with the type being equal to `SimdMask`.
unsafe { std::mem::transmute(vecmask) }
}
8 => {
// the bit position in each lane indicates the index of that lane
let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
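The new 4-lane arm follows the same trick as the existing 8- and 16-lane arms: each lane of `vecidx` holds the bit weight of that lane (1, 2, 4, 8), the low bits of the incoming bitmask are broadcast to every lane, and `(vecidx & vecmask).eq(vecidx)` is true exactly in the lanes whose bit is set. A scalar rendition of the arm, illustrative only and using no packed_simd types:

```rust
/// Scalar rendition of the i128x4 arm of mask_from_u64 above:
/// lane i comes out true exactly when bit i of `mask` is set.
fn mask_from_u64_4_lanes(mask: u64) -> [bool; 4] {
    // per-lane bit weight, mirroring i128x4::new(1, 2, 4, 8)
    let vecidx: [i128; 4] = [1, 2, 4, 8];
    // broadcast the lowermost 4 bits of the mask to every lane
    let splat = (mask & 0x0F) as i128;

    let mut lanes = [false; 4];
    for (lane, idx) in vecidx.iter().enumerate() {
        // the AND isolates this lane's bit; comparing against the lane's
        // weight checks whether that bit was present in the mask
        lanes[lane] = (*idx & splat) == *idx;
    }
    lanes
}

// e.g. mask_from_u64_4_lanes(0b1101) == [true, false, true, true]
```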
@@ -448,11 +462,11 @@ macro_rules! make_float_numeric_type {
make_float_numeric_type!(Float32Type, f32x16);
make_float_numeric_type!(Float64Type, f64x8);

#[cfg(all(test, simd_x86))]
#[cfg(all(test, feature = "simd"))]
mod tests {
use crate::datatypes::{
ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, Int8Type,
UInt16Type,
IntervalMonthDayNanoType, UInt16Type,
};
use packed_simd::*;
use FromCast;
@@ -470,6 +484,17 @@ mod tests {
}};
}

#[test]
fn test_mask_i128() {
let mask = 0b1101;
let actual = IntervalMonthDayNanoType::mask_from_u64(mask);
let expected = expected_mask!(i128, mask);
let expected =
m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice()));

assert_eq!(expected, actual);
}

#[test]
fn test_mask_f64() {
let mask = 0b10101010;