Skip to content

Commit

Permalink
Add utf-8 validation checking for substring (#1577)
Browse files Browse the repository at this point in the history
* add utf-8 validation checking
update doc
add a test for invalid array type

Signed-off-by: remzi <13716567376yh@gmail.com>

* add tests
clean up

Signed-off-by: remzi <13716567376yh@gmail.com>

* test the worst case

Signed-off-by: remzi <13716567376yh@gmail.com>

* update doc and tests

Signed-off-by: remzi <13716567376yh@gmail.com>

* update doc

Signed-off-by: remzi <13716567376yh@gmail.com>

* use std method is_char_boundary
update doc

Signed-off-by: remzi <13716567376yh@gmail.com>

* add 2 substring benches

Signed-off-by: remzi <13716567376yh@gmail.com>

* replace dyn Fn with loop unswitching

Signed-off-by: remzi <13716567376yh@gmail.com>

* Update arrow/src/compute/kernels/substring.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
  • Loading branch information
HaoYang670 and tustvold authored Apr 19, 2022
1 parent db4cb8f commit 43f4e16
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 43 deletions.
13 changes: 8 additions & 5 deletions arrow/benches/string_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,22 @@ use arrow::array::*;
use arrow::compute::kernels::substring::substring;
use arrow::util::bench_util::*;

fn bench_substring(arr: &StringArray, start: i64, length: usize) {
substring(criterion::black_box(arr), start, Some(length as u64)).unwrap();
fn bench_substring(arr: &StringArray, start: i64, length: Option<u64>) {
substring(criterion::black_box(arr), start, length).unwrap();
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let str_len = 1000;

let arr_string = create_string_array_with_len::<i32>(size, 0.0, str_len);
let start = 0;

c.bench_function("substring", |b| {
b.iter(|| bench_substring(&arr_string, start, str_len))
c.bench_function("substring (start = 0, length = None)", |b| {
b.iter(|| bench_substring(&arr_string, 0, None))
});

c.bench_function("substring (start = 1, length = str_len - 1)", |b| {
b.iter(|| bench_substring(&arr_string, 1, Some((str_len - 1) as u64)))
});
}

Expand Down
107 changes: 69 additions & 38 deletions arrow/src/compute/kernels/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};
use std::cmp::Ordering;

fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
Expand All @@ -35,37 +36,46 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
let data = values.as_slice();
let zero = OffsetSize::zero();

let cal_new_start: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> = if start
>= zero
{
// count from the start of string
Box::new(|old_start: OffsetSize, end: OffsetSize| (old_start + start).min(end))
} else {
// count from the end of string
Box::new(|old_start: OffsetSize, end: OffsetSize| (end + start).max(old_start))
// Check if `offset` is at a valid char boundary.
// If yes, return `offset`, else return error
let check_char_boundary = {
// Safety: a StringArray must contain valid UTF8 data
let data_str = unsafe { std::str::from_utf8_unchecked(data) };
|offset: OffsetSize| {
let offset_usize = offset.to_usize().unwrap();
if data_str.is_char_boundary(offset_usize) {
Ok(offset)
} else {
Err(ArrowError::ComputeError(format!(
"The offset {} is at an invalid utf-8 boundary.",
offset_usize
)))
}
}
};

let cal_new_length: Box<dyn Fn(OffsetSize, OffsetSize) -> OffsetSize> =
if let Some(length) = length {
Box::new(move |start: OffsetSize, end: OffsetSize| length.min(end - start))
} else {
Box::new(move |start: OffsetSize, end: OffsetSize| end - start)
};

// start and end offsets for each substring
// start and end offsets of all substrings
let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> =
Vec::with_capacity(array.len());
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
let mut len_so_far = zero;
new_offsets.push(zero);

offsets.windows(2).for_each(|pair| {
let new_start = cal_new_start(pair[0], pair[1]);
let new_length = cal_new_length(new_start, pair[1]);
len_so_far += new_length;
new_starts_ends.push((new_start, new_start + new_length));
offsets.windows(2).try_for_each(|pair| -> Result<()> {
let new_start = match start.cmp(&zero) {
Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
Ordering::Equal => pair[0],
Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
};
let new_end = match length {
Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
None => pair[1],
};
len_so_far += new_end - new_start;
new_starts_ends.push((new_start, new_end));
new_offsets.push(len_so_far);
});
Ok(())
})?;

// concatenate substrings into a buffer
let mut new_values =
Expand Down Expand Up @@ -107,29 +117,28 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
///
/// Attention: Both `start` and `length` are counted by byte, not by char.
///
/// # Warning
///
/// This function **might** return in invalid utf-8 format if the
/// character length falls on a non-utf8 boundary, which we
/// [hope to fix](https://github.com/apache/arrow-rs/issues/1531)
/// in a future release.
///
/// ## Example of getting an invalid substring
/// # Basic usage
/// ```
/// # // Doesn't pass due to https://github.com/apache/arrow-rs/issues/1531
/// # #[cfg(not(feature = "force_validate"))]
/// # {
/// # use arrow::array::StringArray;
/// # use arrow::compute::kernels::substring::substring;
/// let array = StringArray::from(vec![Some("E=mc²")]);
/// let result = substring(&array, -1, None).unwrap();
/// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]);
/// let result = substring(&array, 1, Some(4)).unwrap();
/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
/// assert_eq!(result.value(0).as_bytes(), &[0x00B2]); // invalid utf-8 format
/// # }
/// assert_eq!(result, &StringArray::from(vec![Some("rrow"), None, Some("ust")]));
/// ```
///
/// # Error
/// this function errors when the passed array is not a \[Large\]String array.
/// - The function errors when the passed array is not a \[Large\]String array.
/// - The function errors if the offset of a substring in the input array is at invalid char boundary.
///
/// ## Example of trying to get an invalid utf-8 format substring
/// ```
/// # use arrow::array::StringArray;
/// # use arrow::compute::kernels::substring::substring;
/// let array = StringArray::from(vec![Some("E=mc²")]);
/// let error = substring(&array, 0, Some(5)).unwrap_err().to_string();
/// assert!(error.contains("invalid utf-8 boundary"));
/// ```
pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<ArrayRef> {
match array.data_type() {
DataType::LargeUtf8 => generic_substring(
Expand Down Expand Up @@ -304,4 +313,26 @@ mod tests {
fn without_nulls_large_string() -> Result<()> {
without_nulls::<LargeStringArray>()
}

#[test]
fn check_invalid_array_type() {
let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
let err = substring(&array, 0, None).unwrap_err().to_string();
assert!(err.contains("substring does not support type"));
}

// tests for the utf-8 validation checking
#[test]
fn check_start_index() {
let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
let err = substring(&array, -1, None).unwrap_err().to_string();
assert!(err.contains("invalid utf-8 boundary"));
}

#[test]
fn check_length() {
let array = StringArray::from(vec![Some("E=mc²"), Some("ascii")]);
let err = substring(&array, 0, Some(5)).unwrap_err().to_string();
assert!(err.contains("invalid utf-8 boundary"));
}
}

0 comments on commit 43f4e16

Please sign in to comment.