diff --git a/crates/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs
index 326349c36815..f7a7e080ffcb 100644
--- a/crates/polars-ops/src/chunked_array/strings/mod.rs
+++ b/crates/polars-ops/src/chunked_array/strings/mod.rs
@@ -19,6 +19,8 @@ mod reverse;
 #[cfg(feature = "strings")]
 mod split;
 #[cfg(feature = "strings")]
+mod starts_with;
+#[cfg(feature = "strings")]
 mod strip;
 #[cfg(feature = "strings")]
 mod substring;
diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs
index 93574e5f3080..e2ab82ac3aca 100644
--- a/crates/polars-ops/src/chunked_array/strings/namespace.rs
+++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs
@@ -1,4 +1,4 @@
-use arrow::array::ValueSize;
+use arrow::array::{Array, ValueSize};
 use arrow::legacy::kernels::string::*;
 #[cfg(feature = "string_encoding")]
 use base64::engine::general_purpose;
@@ -14,6 +14,7 @@ use regex::escape;
 use super::*;
 #[cfg(feature = "binary_encoding")]
 use crate::chunked_array::binary::BinaryNameSpaceImpl;
+use crate::prelude::strings::starts_with::starts_with_str;
 
 // We need this to infer the right lifetimes for the match closure.
 #[inline(always)]
@@ -216,6 +217,45 @@ pub trait StringNameSpaceImpl: AsString {
         }
     }
 
+    /// Check whether each string starts with the literal prefix `sub`.
+    ///
+    /// Every chunk's views are tested with `starts_with_str`, and the chunk's
+    /// validity is carried over to the boolean output unchanged.
+    fn starts_with(&self, sub: &str) -> BooleanChunked {
+        let ca = self.as_string();
+        let iter = ca.downcast_iter().map(|arr| {
+            let out: <BooleanType as PolarsDataType>::Array = arr
+                .views()
+                .iter()
+                .map(|view| starts_with_str(*view, sub, arr.data_buffers()))
+                .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest()));
+
+            out.with_validity_typed(arr.validity().cloned())
+        });
+
+        ChunkedArray::from_chunk_iter(ca.name().clone(), iter)
+    }
+
+    /// Check whether each string starts with the corresponding value of `prefix`.
+    ///
+    /// A `prefix` of length 1 is treated as a scalar: a non-null value is
+    /// dispatched to [`Self::starts_with`], which works on the string views
+    /// directly, and a null value yields an all-null result. Any other length
+    /// is compared elementwise via `broadcast_binary_elementwise_values`.
+    ///
+    /// Only `starts_with` is specialised here; for `ends_with` use the
+    /// `BinaryChunked` implementation.
+    fn starts_with_chunked(&self, prefix: &StringChunked) -> BooleanChunked {
+        let ca = self.as_string();
+        match prefix.len() {
+            1 => match prefix.get(0) {
+                Some(s) => self.starts_with(s),
+                None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
+            },
+            _ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub)),
+        }
+    }
+
     /// Get the length of the string values as number of chars.
     fn str_len_chars(&self) -> UInt32Chunked {
         let ca = self.as_string();
diff --git a/crates/polars-ops/src/chunked_array/strings/starts_with.rs b/crates/polars-ops/src/chunked_array/strings/starts_with.rs
new file mode 100644
index 000000000000..91096931125b
--- /dev/null
+++ b/crates/polars-ops/src/chunked_array/strings/starts_with.rs
@@ -0,0 +1,31 @@
+use arrow::array::View;
+use arrow::buffer::Buffer;
+use polars_utils::slice::SliceAble;
+
+/// Checks whether the string described by `view` starts with `prefix`.
+///
+/// Strings of at most `View::MAX_INLINE_SIZE` bytes are compared using only the
+/// bytes stored inline in the view. Longer strings are first screened against
+/// the four prefix bytes kept in the view, and the data buffers are only read
+/// when that screen passes.
+pub(crate) fn starts_with_str(view: View, prefix: &str, buffers: &[Buffer<u8>]) -> bool {
+    let needle = prefix.as_bytes();
+    // SAFETY:
+    // - the inlined slice is only requested when `view.length` is within
+    //   `View::MAX_INLINE_SIZE`;
+    // - the needle is sliced with an end clamped to its own length;
+    // - NOTE(review): `get_slice_unchecked` presumably requires `view` to be valid
+    //   for `buffers`, i.e. both must come from the same array -- confirm.
+    unsafe {
+        if view.length <= View::MAX_INLINE_SIZE {
+            view.get_inlined_slice_unchecked().starts_with(needle)
+        } else {
+            // `view.prefix` only holds four bytes, so compare it against at most
+            // the first four bytes of the needle. A needle shorter than four bytes
+            // must not be sliced to `0..4`: that would read past its end.
+            let head = needle.slice_unchecked(0..needle.len().min(4));
+            view.prefix.to_le_bytes().starts_with(head)
+                && view.get_slice_unchecked(buffers).starts_with(needle)
+        }
+    }
+}
diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs
index 039c995557be..48fd35725fde 100644
--- a/crates/polars-plan/src/dsl/function_expr/strings.rs
+++ b/crates/polars-plan/src/dsl/function_expr/strings.rs
@@ -502,8 +502,9 @@ pub(super) fn ends_with(s: &[Column]) -> PolarsResult<Column> {
 }
 
+/// Tests whether the strings in `s[0]` start with the prefixes in `s[1]`, using the
+/// string-typed `starts_with_chunked` rather than the binary implementation.
 pub(super) fn starts_with(s: &[Column]) -> PolarsResult<Column> {
-    let ca = &s[0].str()?.as_binary();
-    let prefix = &s[1].str()?.as_binary();
-
+    let ca = s[0].str()?;
+    let prefix = s[1].str()?;
     Ok(ca.starts_with_chunked(prefix).into_column())
 }