Skip to content

Commit

Permalink
feat(rust): Speed up starts_with for small prefixes (#19904)
Browse files Browse the repository at this point in the history
  • Loading branch information
stijnherfst authored Nov 22, 2024
1 parent 132c64d commit 5b3a8f9
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 4 deletions.
2 changes: 2 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ mod reverse;
#[cfg(feature = "strings")]
mod split;
#[cfg(feature = "strings")]
mod starts_with;
#[cfg(feature = "strings")]
mod strip;
#[cfg(feature = "strings")]
mod substring;
Expand Down
32 changes: 31 additions & 1 deletion crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use arrow::array::ValueSize;
use arrow::array::{Array, ValueSize};
use arrow::legacy::kernels::string::*;
#[cfg(feature = "string_encoding")]
use base64::engine::general_purpose;
Expand All @@ -14,6 +14,7 @@ use regex::escape;
use super::*;
#[cfg(feature = "binary_encoding")]
use crate::chunked_array::binary::BinaryNameSpaceImpl;
use crate::prelude::strings::starts_with::starts_with_str;

// We need this to infer the right lifetimes for the match closure.
#[inline(always)]
Expand Down Expand Up @@ -216,6 +217,35 @@ pub trait StringNameSpaceImpl: AsString {
}
}

/// Test whether each string value begins with the literal `sub`.
///
/// The check runs chunk by chunk over the raw views through `starts_with_str`,
/// and every output chunk is given the validity of the input chunk it came from.
fn starts_with(&self, sub: &str) -> BooleanChunked {
    let ca = self.as_string();
    let chunks = ca.downcast_iter().map(|chunk| {
        let buffers = chunk.data_buffers();
        let values: <BooleanType as PolarsDataType>::Array = chunk
            .views()
            .iter()
            .copied()
            .map(|view| starts_with_str(view, sub, buffers))
            .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest()));

        // Reuse the input chunk's validity for the boolean result.
        values.with_validity_typed(chunk.validity().cloned())
    });

    ChunkedArray::from_chunk_iter(ca.name().clone(), chunks)
}

/// Test whether each string value begins with the corresponding value of `prefix`.
///
/// A `prefix` of length one is handled through [`Self::starts_with`], which is faster
/// than the `BinaryChunked` version because it can use the inline prefix of the views.
/// A null single prefix yields an all-null result. Any other length goes through the
/// element-wise broadcast path.
///
/// There is no such specialization for `ends_with`; use `BinaryChunked::ends_with` there.
fn starts_with_chunked(&self, prefix: &StringChunked) -> BooleanChunked {
    let ca = self.as_string();

    if prefix.len() != 1 {
        return broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub));
    }

    match prefix.get(0) {
        Some(literal) => self.starts_with(literal),
        None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
    }
}

/// Get the length of the string values as number of chars.
fn str_len_chars(&self) -> UInt32Chunked {
let ca = self.as_string();
Expand Down
25 changes: 25 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/starts_with.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use arrow::array::View;
use arrow::buffer::Buffer;
use polars_utils::slice::SliceAble;

/// Checks whether the string described by `view` starts with `prefix`.
///
/// When the string is at most `View::MAX_INLINE_SIZE` bytes long it is read from the
/// view itself and `buffers` is not touched, which makes this case very fast.
/// For longer strings the 4-byte prefix stored in the view is compared against the
/// first (up to four) bytes of `prefix`; `buffers` is only read when that cheap
/// comparison succeeds.
///
/// `view` is not validated against `buffers` here; the caller is expected to pass the
/// data buffers of the array the view was taken from.
pub(crate) fn starts_with_str(view: View, prefix: &str, buffers: &[Buffer<u8>]) -> bool {
    let needle = prefix.as_bytes();

    if view.length <= View::MAX_INLINE_SIZE {
        // SAFETY: the length was just checked to be within the inline size, which is
        // presumably the precondition of `get_inlined_slice_unchecked`.
        let inlined = unsafe { view.get_inlined_slice_unchecked() };
        return inlined.starts_with(needle);
    }

    // The view only stores four prefix bytes, and `prefix` may be shorter than that,
    // so clamp the range. An unclamped `0..4` would read past the end of a short prefix.
    let head_len = needle.len().min(4);
    // SAFETY: `head_len <= needle.len()`, so `0..head_len` is in bounds.
    let needle_head = unsafe { needle.slice_unchecked(0..head_len) };
    if !view.prefix.to_le_bytes().starts_with(needle_head) {
        return false;
    }

    // SAFETY: relies on the caller passing the buffers that `view` refers to.
    let full = unsafe { view.get_slice_unchecked(buffers) };
    full.starts_with(needle)
}
5 changes: 2 additions & 3 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,8 @@ pub(super) fn ends_with(s: &[Column]) -> PolarsResult<Column> {
}

/// Checks whether the strings in `s[0]` start with the prefix given in `s[1]`.
///
/// Both inputs are accessed through `str()?`, so an error from either call is
/// returned to the caller. The comparison itself is delegated to
/// `starts_with_chunked`, and the result is converted back into a `Column`.
pub(super) fn starts_with(s: &[Column]) -> PolarsResult<Column> {
// NOTE(review): the next two bindings are shadowed right below and never used;
// they look like the pre-change (removed) lines of this diff hunk.
let ca = &s[0].str()?.as_binary();
let prefix = &s[1].str()?.as_binary();

// Keep the inputs as string arrays (no `as_binary()` conversion) before delegating.
let ca = s[0].str()?;
let prefix = s[1].str()?;
Ok(ca.starts_with_chunked(prefix).into_column())
}

Expand Down

0 comments on commit 5b3a8f9

Please sign in to comment.