From ab90087a8fcc22de585526fbb9ecfd9bab6efe5a Mon Sep 17 00:00:00 2001 From: roee88 Date: Fri, 14 May 2021 05:29:32 +0300 Subject: [PATCH] Fix undefined behavior in FFI - Fix UB due to aliasing - Enable MIRI in CI for most tests in arrow crate Signed-off-by: roee88 --- .github/workflows/rust.yml | 5 ++-- arrow/src/array/raw_pointer.rs | 1 + arrow/src/compute/kernels/cast.rs | 1 + arrow/src/compute/kernels/cast_utils.rs | 1 + arrow/src/compute/kernels/length.rs | 2 ++ arrow/src/ffi.rs | 31 +++++++++++++------------ arrow/src/util/bit_chunk_iterator.rs | 4 ++++ arrow/src/util/integration_util.rs | 1 + 8 files changed, 29 insertions(+), 17 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 53f9ac20c3b8..dc05a4e73ecb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -259,8 +259,9 @@ jobs: export MIRIFLAGS="-Zmiri-disable-isolation" cargo miri setup cargo clean - # Ignore MIRI errors until we can get a clean run - cargo miri test || true + # Currently only the arrow crate is tested with miri + # IO related tests and some unsupported tests are skipped + cargo miri test -p arrow -- --skip csv --skip ipc --skip json lint: name: Lint diff --git a/arrow/src/array/raw_pointer.rs b/arrow/src/array/raw_pointer.rs index 185e1cbe98a7..cd1f802ae778 100644 --- a/arrow/src/array/raw_pointer.rs +++ b/arrow/src/array/raw_pointer.rs @@ -57,6 +57,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] + #[cfg_attr(miri, ignore)] // sometimes does not panic as expected fn test_primitive_array_alignment() { let bytes = vec![0u8, 1u8]; unsafe { RawPtrBox::::new(bytes.as_ptr().offset(1)) }; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index de1516b0768b..d7da0629b775 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -3521,6 +3521,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // running forever fn test_can_cast_types() { // this function attempts to ensure that can_cast_types stays // in sync with cast. It simply tries all combinations of diff --git a/arrow/src/compute/kernels/cast_utils.rs b/arrow/src/compute/kernels/cast_utils.rs index a06bf421ea4f..201d20962f22 100644 --- a/arrow/src/compute/kernels/cast_utils.rs +++ b/arrow/src/compute/kernels/cast_utils.rs @@ -220,6 +220,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime fn string_to_timestamp_no_timezone() -> Result<()> { // This test is designed to succeed in regardless of the local // timezone the test machine is running. Thus it is still diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 4d704d270785..9f392e6a91cc 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -154,6 +154,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // running forever fn length_test_string() -> Result<()> { length_cases() .into_iter() @@ -170,6 +171,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // running forever fn length_test_large_string() -> Result<()> { length_cases() .into_iter() diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 7b789f89a522..42dc4407abb6 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -153,27 +153,30 @@ impl FFI_ArrowSchema { _ => vec![], }; // note: this cannot be done along with the above because the above is fallible and this op leaks. - let mut children_ptr = children_vec + let children_ptr = children_vec .into_iter() .map(Box::into_raw) .collect::>(); let n_children = children_ptr.len() as i64; - let children = children_ptr.as_mut_ptr(); + + let flags = field.is_nullable() as i64 * 2; + + let mut private = Box::new(SchemaPrivateData { + field, + children_ptr, + }); // Ok(FFI_ArrowSchema { format: CString::new(format).unwrap().into_raw(), name: CString::new(name).unwrap().into_raw(), metadata: std::ptr::null_mut(), - flags: field.is_nullable() as i64 * 2, + flags, n_children, - children, + children: private.children_ptr.as_mut_ptr(), dictionary: std::ptr::null_mut(), release: Some(release_schema), - private_data: Box::into_raw(Box::new(SchemaPrivateData { - field, - children_ptr, - })) as *mut ::std::os::raw::c_void, + private_data: Box::into_raw(private) as *mut ::std::os::raw::c_void, }) } @@ -448,7 +451,7 @@ impl FFI_ArrowArray { .collect::>(); let n_buffers = buffers.len() as i64; - let mut buffers_ptr = buffers + let buffers_ptr = buffers .iter() .map(|maybe_buffer| match maybe_buffer { // note that `raw_data` takes into account the buffer's offset @@ -456,19 +459,17 @@ impl FFI_ArrowArray { None => std::ptr::null(), }) .collect::>(); - let pointer = buffers_ptr.as_mut_ptr(); - let mut children = data + let children = data .child_data() .iter() .map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child)))) .collect::>(); - let children_ptr = children.as_mut_ptr(); let n_children = children.len() as i64; // create the private data owning everything. // any other data must be added here, e.g. via a struct, to track lifetime. - let private_data = Box::new(PrivateData { + let mut private_data = Box::new(PrivateData { buffers, buffers_ptr, children, @@ -480,8 +481,8 @@ impl FFI_ArrowArray { offset: data.offset() as i64, n_buffers, n_children, - buffers: pointer, - children: children_ptr, + buffers: private_data.buffers_ptr.as_mut_ptr(), + children: private_data.children.as_mut_ptr(), dictionary: std::ptr::null_mut(), release: Some(release_array), private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void, diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs index b9145b7af86f..c57d0212d290 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow/src/util/bit_chunk_iterator.rs @@ -184,6 +184,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned() { let input: &[u8] = &[ 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, @@ -205,6 +206,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_1_byte() { let input: &[u8] = &[ 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, @@ -226,6 +228,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_bits_across_bytes() { let input: &[u8] = &[0b00111111, 0b11111100]; let buffer: Buffer = Buffer::from(input); @@ -239,6 +242,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_bits_large() { let input: &[u8] = &[ 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index ec2c294cb4e1..2403151343bf 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -722,6 +722,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // running forever fn test_arrow_data_equality() { let secs_tz = Some("Europe/Budapest".to_string()); let millis_tz = Some("America/New_York".to_string());