From 099465946e8650c8efd4c86d49a93944344be331 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 10:17:11 +0100 Subject: [PATCH 01/10] Speed up starts_with for small prefixes --- crates/polars-arrow/src/array/binview/view.rs | 16 ++++++++++ .../src/chunked_array/strings/namespace.rs | 32 +++++++++++++++++-- .../src/dsl/function_expr/strings.rs | 5 ++- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index 70285aacead5..b8abb663b648 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -136,6 +136,22 @@ impl View { } } + /// Checks if the string starts with the prefix + /// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast + pub fn starts_with<'a>(&self, prefix: &str, buffers: &'a [Buffer]) -> bool { + unsafe { + if self.length <= View::MAX_INLINE_SIZE { + self.get_inlined_slice_unchecked().starts_with(prefix.as_bytes()) + } else { + let starts = self.prefix.to_le_bytes().starts_with(&prefix.as_bytes()[0..4]); + if starts { + return self.get_slice_unchecked(buffers).starts_with(prefix.as_bytes()); + } + false + } + } + } + /// Constructs a byteslice from this view. /// /// # Safety diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 93574e5f3080..5838dcb9f7ac 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -1,4 +1,4 @@ -use arrow::array::ValueSize; +use arrow::array::{Array, ValueSize}; use arrow::legacy::kernels::string::*; #[cfg(feature = "string_encoding")] use base64::engine::general_purpose; @@ -10,7 +10,6 @@ use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; use polars_utils::cache::FastFixedCache; use regex::escape; - use super::*; #[cfg(feature = "binary_encoding")] use crate::chunked_array::binary::BinaryNameSpaceImpl; @@ -216,6 +215,35 @@ pub trait StringNameSpaceImpl: AsString { } } + /// Check if strings starts with a substring + fn starts_with(&self, sub: &str) -> BooleanChunked { + let ca = self.as_string(); + + unsafe { + let iter = ca.downcast_iter().map(|arr| { + let out: ::Array = arr.views().iter().map(|view| { + view.starts_with(sub, arr.data_buffers()) + }).collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); + out.with_validity_typed(arr.validity().cloned()) + }); + + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) + } + } + + /// This is more performant than the BinaryChunked version because we use the inline prefix + /// Use the BinaryChunked::ends_with as there is no specialization here for that + fn starts_with_chunked(&self, prefix: &StringChunked) -> BooleanChunked { + let ca = self.as_string(); + match prefix.len() { + 1 => match prefix.get(0) { + Some(s) => self.starts_with(s), + None => BooleanChunked::full_null(ca.name().clone(), ca.len()), + }, + _ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub)), + } + } + /// Get the length of the string values as number of chars. fn str_len_chars(&self) -> UInt32Chunked { let ca = self.as_string(); diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 039c995557be..48fd35725fde 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -502,9 +502,8 @@ pub(super) fn ends_with(s: &[Column]) -> PolarsResult { } pub(super) fn starts_with(s: &[Column]) -> PolarsResult { - let ca = &s[0].str()?.as_binary(); - let prefix = &s[1].str()?.as_binary(); - + let ca = s[0].str()?; + let prefix = s[1].str()?; Ok(ca.starts_with_chunked(prefix).into_column()) } From 1750a0f51c175e8230a1507dc9129d59c0716690 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 10:38:36 +0100 Subject: [PATCH 02/10] Format --- crates/polars-arrow/src/array/binview/view.rs | 12 +++++++++--- .../src/chunked_array/strings/namespace.rs | 9 ++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index b8abb663b648..eeb6dd43d157 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -141,11 +141,17 @@ impl View { pub fn starts_with<'a>(&self, prefix: &str, buffers: &'a [Buffer]) -> bool { unsafe { if self.length <= View::MAX_INLINE_SIZE { - self.get_inlined_slice_unchecked().starts_with(prefix.as_bytes()) + self.get_inlined_slice_unchecked() + .starts_with(prefix.as_bytes()) } else { - let starts = self.prefix.to_le_bytes().starts_with(&prefix.as_bytes()[0..4]); + let starts = self + .prefix + .to_le_bytes() + .starts_with(&prefix.as_bytes()[0..4]); if starts { - return self.get_slice_unchecked(buffers).starts_with(prefix.as_bytes()); + return self + .get_slice_unchecked(buffers) + .starts_with(prefix.as_bytes()); } false } diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 5838dcb9f7ac..059d9b426ec3 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -10,6 +10,7 @@ use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; use polars_utils::cache::FastFixedCache; use regex::escape; + use super::*; #[cfg(feature = "binary_encoding")] use crate::chunked_array::binary::BinaryNameSpaceImpl; @@ -221,9 +222,11 @@ pub trait StringNameSpaceImpl: AsString { unsafe { let iter = ca.downcast_iter().map(|arr| { - let out: ::Array = arr.views().iter().map(|view| { - view.starts_with(sub, arr.data_buffers()) - }).collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); + let out: ::Array = arr + .views() + .iter() + .map(|view| view.starts_with(sub, arr.data_buffers())) + .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); out.with_validity_typed(arr.validity().cloned()) }); From 9d7db214c3b4e7fe57fa065bfc4f34b2214b74f4 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 10:46:23 +0100 Subject: [PATCH 03/10] Format --- crates/polars-arrow/src/array/binview/view.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index eeb6dd43d157..4a3f9d01b2f6 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -138,7 +138,7 @@ impl View { /// Checks if the string starts with the prefix /// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast - pub fn starts_with<'a>(&self, prefix: &str, buffers: &'a [Buffer]) -> bool { + pub fn starts_with(&self, prefix: &str, buffers: &[Buffer]) -> bool { unsafe { if self.length <= View::MAX_INLINE_SIZE { self.get_inlined_slice_unchecked() From c9caae27a41afd0845d98d7fd6b45586593cc6d2 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 10:49:32 +0100 Subject: [PATCH 04/10] Remove unsafe block --- .../src/chunked_array/strings/namespace.rs | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 059d9b426ec3..3fa0436e13c2 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -220,18 +220,16 @@ pub trait StringNameSpaceImpl: AsString { fn starts_with(&self, sub: &str) -> BooleanChunked { let ca = self.as_string(); - unsafe { - let iter = ca.downcast_iter().map(|arr| { - let out: ::Array = arr - .views() - .iter() - .map(|view| view.starts_with(sub, arr.data_buffers())) - .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); - out.with_validity_typed(arr.validity().cloned()) - }); - - ChunkedArray::from_chunk_iter(ca.name().clone(), iter) - } + let iter = ca.downcast_iter().map(|arr| { + let out: ::Array = arr + .views() + .iter() + .map(|view| view.starts_with(sub, arr.data_buffers())) + .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); + out.with_validity_typed(arr.validity().cloned()) + }); + + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } /// This is more performant than the BinaryChunked version because we use the inline prefix From 1c45a895a258b176cd97e5e7c2c13eba0b27e001 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 11:29:26 +0100 Subject: [PATCH 05/10] Resolve feedback --- crates/polars-arrow/src/array/binview/view.rs | 22 ---------------- .../src/chunked_array/strings/mod.rs | 2 ++ .../src/chunked_array/strings/namespace.rs | 1 + .../src/chunked_array/strings/starts_with.rs | 25 +++++++++++++++++++ 4 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 crates/polars-ops/src/chunked_array/strings/starts_with.rs diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index 4a3f9d01b2f6..70285aacead5 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -136,28 +136,6 @@ impl View { } } - /// Checks if the string starts with the prefix - /// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast - pub fn starts_with(&self, prefix: &str, buffers: &[Buffer]) -> bool { - unsafe { - if self.length <= View::MAX_INLINE_SIZE { - self.get_inlined_slice_unchecked() - .starts_with(prefix.as_bytes()) - } else { - let starts = self - .prefix - .to_le_bytes() - .starts_with(&prefix.as_bytes()[0..4]); - if starts { - return self - .get_slice_unchecked(buffers) - .starts_with(prefix.as_bytes()); - } - false - } - } - } - /// Constructs a byteslice from this view. /// /// # Safety diff --git a/crates/polars-ops/src/chunked_array/strings/mod.rs b/crates/polars-ops/src/chunked_array/strings/mod.rs index 326349c36815..f7a7e080ffcb 100644 --- a/crates/polars-ops/src/chunked_array/strings/mod.rs +++ b/crates/polars-ops/src/chunked_array/strings/mod.rs @@ -19,6 +19,8 @@ mod reverse; #[cfg(feature = "strings")] mod split; #[cfg(feature = "strings")] +mod starts_with; +#[cfg(feature = "strings")] mod strip; #[cfg(feature = "strings")] mod substring; diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 3fa0436e13c2..83b9cb977cba 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -9,6 +9,7 @@ use polars_core::export::num::Num; use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; use polars_utils::cache::FastFixedCache; +use polars_utils::slice::SliceAble; use regex::escape; use super::*; diff --git a/crates/polars-ops/src/chunked_array/strings/starts_with.rs b/crates/polars-ops/src/chunked_array/strings/starts_with.rs new file mode 100644 index 000000000000..4251f48b8511 --- /dev/null +++ b/crates/polars-ops/src/chunked_array/strings/starts_with.rs @@ -0,0 +1,25 @@ +use arrow::array::View; +use arrow::buffer::Buffer; +use polars_utils::slice::SliceAble; + +/// Checks if the string starts with the prefix +/// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast +pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer]) -> bool { + unsafe { + if view.length <= View::MAX_INLINE_SIZE { + view.get_inlined_slice_unchecked() + .starts_with(prefix.as_bytes()) + } else { + let starts = view + .prefix + .to_le_bytes() + .starts_with(&prefix.as_bytes().slice_unchecked(0..4)); + if starts { + return view + .get_slice_unchecked(buffers) + .starts_with(prefix.as_bytes()); + } + false + } + } +} From 642b93ad895221113497d104a8a7b5ebbe4346e8 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 11:33:15 +0100 Subject: [PATCH 06/10] Fix fmt --- crates/polars-ops/src/chunked_array/strings/namespace.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 83b9cb977cba..fcfc56a2e5ec 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -9,12 +9,12 @@ use polars_core::export::num::Num; use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; use polars_utils::cache::FastFixedCache; -use polars_utils::slice::SliceAble; use regex::escape; use super::*; #[cfg(feature = "binary_encoding")] use crate::chunked_array::binary::BinaryNameSpaceImpl; +use crate::prelude::strings::starts_with::starts_with; // We need this to infer the right lifetimes for the match closure. #[inline(always)] @@ -225,7 +225,7 @@ pub trait StringNameSpaceImpl: AsString { let out: ::Array = arr .views() .iter() - .map(|view| view.starts_with(sub, arr.data_buffers())) + .map(|view| starts_with(*view, sub, arr.data_buffers())) .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); out.with_validity_typed(arr.validity().cloned()) }); From c9a4f79317037f7325ad4dd3a16c99ebe1ad6850 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 12:42:41 +0100 Subject: [PATCH 07/10] Lint --- crates/polars-ops/src/chunked_array/strings/starts_with.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-ops/src/chunked_array/strings/starts_with.rs b/crates/polars-ops/src/chunked_array/strings/starts_with.rs index 4251f48b8511..14c9690fc16c 100644 --- a/crates/polars-ops/src/chunked_array/strings/starts_with.rs +++ b/crates/polars-ops/src/chunked_array/strings/starts_with.rs @@ -13,7 +13,7 @@ pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer]) -> bool { let starts = view .prefix .to_le_bytes() - .starts_with(&prefix.as_bytes().slice_unchecked(0..4)); + .starts_with(prefix.as_bytes().slice_unchecked(0..4)); if starts { return view .get_slice_unchecked(buffers) From ad22190f8a00302ddd80f3aafc3a23e783408f5f Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Thu, 21 Nov 2024 14:24:10 +0100 Subject: [PATCH 08/10] Specializations --- .../src/chunked_array/strings/namespace.rs | 55 ++++++++++++++++--- .../src/chunked_array/strings/starts_with.rs | 24 +++++++- 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index fcfc56a2e5ec..0487799fcc8d 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -1,4 +1,5 @@ use arrow::array::{Array, ValueSize}; +use arrow::compute::utils::combine_validities_and; use arrow::legacy::kernels::string::*; #[cfg(feature = "string_encoding")] use base64::engine::general_purpose; @@ -8,13 +9,14 @@ use base64::Engine as _; use polars_core::export::num::Num; use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; +use polars_core::utils::align_chunks_binary; use polars_utils::cache::FastFixedCache; use regex::escape; use super::*; #[cfg(feature = "binary_encoding")] use crate::chunked_array::binary::BinaryNameSpaceImpl; -use crate::prelude::strings::starts_with::starts_with; +use crate::prelude::strings::starts_with::{starts_with_str, starts_with_view}; // We need this to infer the right lifetimes for the match closure. #[inline(always)] @@ -221,12 +223,22 @@ pub trait StringNameSpaceImpl: AsString { fn starts_with(&self, sub: &str) -> BooleanChunked { let ca = self.as_string(); - let iter = ca.downcast_iter().map(|arr| { - let out: ::Array = arr - .views() - .iter() - .map(|view| starts_with(*view, sub, arr.data_buffers())) - .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); + let iter = ca.downcast_iter().map(|arr| unsafe { + // If the buffer is empty then all strings are inlined and we can avoid a branch which might result in vectorization + let out: ::Array = if arr.data_buffers().is_empty() { + arr.views() + .iter() + .map(|view| { + view.get_inlined_slice_unchecked() + .starts_with(sub.as_bytes()) + }) + .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())) + } else { + arr.views() + .iter() + .map(|view| starts_with_str(*view, sub, arr.data_buffers())) + .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())) + }; out.with_validity_typed(arr.validity().cloned()) }); @@ -242,7 +254,34 @@ pub trait StringNameSpaceImpl: AsString { Some(s) => self.starts_with(s), None => BooleanChunked::full_null(ca.name().clone(), ca.len()), }, - _ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub)), + _ => { + let (lhs, rhs) = align_chunks_binary(ca, prefix); + + let iter = + lhs.downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(lhs_arr, rhs_arr)| { + let validity = + combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); + + let element_iter = + lhs_arr.views().iter().zip(rhs_arr.views().iter()).map( + |(lhs_val, rhs_val)| { + starts_with_view( + *lhs_val, + *rhs_val, + lhs_arr.data_buffers(), + rhs_arr.data_buffers(), + ) + }, + ); + + let array: ::Array = + element_iter.collect_arr(); + array.with_validity_typed(validity) + }); + ChunkedArray::from_chunk_iter(lhs.name().clone(), iter) + }, } } diff --git a/crates/polars-ops/src/chunked_array/strings/starts_with.rs b/crates/polars-ops/src/chunked_array/strings/starts_with.rs index 14c9690fc16c..116b9f3c4656 100644 --- a/crates/polars-ops/src/chunked_array/strings/starts_with.rs +++ b/crates/polars-ops/src/chunked_array/strings/starts_with.rs @@ -4,7 +4,7 @@ use polars_utils::slice::SliceAble; /// Checks if the string starts with the prefix /// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast -pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer]) -> bool { +pub(crate) fn starts_with_str(view: View, prefix: &str, buffers: &[Buffer]) -> bool { unsafe { if view.length <= View::MAX_INLINE_SIZE { view.get_inlined_slice_unchecked() @@ -23,3 +23,25 @@ pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer]) -> bool { } } } + +/// Checks if the string starts with the prefix +/// If you call this in a loop and the prefix doesn't change then prefer starts_with_str() +pub(crate) fn starts_with_view( + view: View, + prefix: View, + left_buffers: &[Buffer], + right_buffers: &[Buffer], +) -> bool { + unsafe { + if !view.prefix.to_le_bytes()[0..view.length.min(4) as usize] + .starts_with(&prefix.prefix.to_le_bytes()[..view.length.min(4) as usize]) + { + return false; + } + + let left_buffer = view.get_slice_unchecked(left_buffers); + let right_buffer = prefix.get_slice_unchecked(right_buffers); + + left_buffer.starts_with(right_buffer) + } +} From 522926617c439a4bb5066d6511db7ecf03535c75 Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Fri, 22 Nov 2024 12:59:59 +0100 Subject: [PATCH 09/10] Use inline or buffer version --- .../src/chunked_array/strings/namespace.rs | 55 +++---------------- .../src/chunked_array/strings/starts_with.rs | 22 -------- 2 files changed, 8 insertions(+), 69 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 0487799fcc8d..f54acad433b6 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -1,5 +1,4 @@ use arrow::array::{Array, ValueSize}; -use arrow::compute::utils::combine_validities_and; use arrow::legacy::kernels::string::*; #[cfg(feature = "string_encoding")] use base64::engine::general_purpose; @@ -9,14 +8,13 @@ use base64::Engine as _; use polars_core::export::num::Num; use polars_core::export::regex::Regex; use polars_core::prelude::arity::*; -use polars_core::utils::align_chunks_binary; use polars_utils::cache::FastFixedCache; use regex::escape; use super::*; #[cfg(feature = "binary_encoding")] use crate::chunked_array::binary::BinaryNameSpaceImpl; -use crate::prelude::strings::starts_with::{starts_with_str, starts_with_view}; +use crate::prelude::strings::starts_with::starts_with_str; // We need this to infer the right lifetimes for the match closure. #[inline(always)] @@ -222,23 +220,13 @@ pub trait StringNameSpaceImpl: AsString { /// Check if strings starts with a substring fn starts_with(&self, sub: &str) -> BooleanChunked { let ca = self.as_string(); - let iter = ca.downcast_iter().map(|arr| unsafe { - // If the buffer is empty then all strings are inlined and we can avoid a branch which might result in vectorization - let out: ::Array = if arr.data_buffers().is_empty() { - arr.views() - .iter() - .map(|view| { - view.get_inlined_slice_unchecked() - .starts_with(sub.as_bytes()) - }) - .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())) - } else { - arr.views() - .iter() - .map(|view| starts_with_str(*view, sub, arr.data_buffers())) - .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())) - }; + let out: ::Array = arr + .views() + .iter() + .map(|view| starts_with_str(*view, sub, arr.data_buffers())) + .collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest())); + out.with_validity_typed(arr.validity().cloned()) }); @@ -254,34 +242,7 @@ pub trait StringNameSpaceImpl: AsString { Some(s) => self.starts_with(s), None => BooleanChunked::full_null(ca.name().clone(), ca.len()), }, - _ => { - let (lhs, rhs) = align_chunks_binary(ca, prefix); - - let iter = - lhs.downcast_iter() - .zip(rhs.downcast_iter()) - .map(|(lhs_arr, rhs_arr)| { - let validity = - combine_validities_and(lhs_arr.validity(), rhs_arr.validity()); - - let element_iter = - lhs_arr.views().iter().zip(rhs_arr.views().iter()).map( - |(lhs_val, rhs_val)| { - starts_with_view( - *lhs_val, - *rhs_val, - lhs_arr.data_buffers(), - rhs_arr.data_buffers(), - ) - }, - ); - - let array: ::Array = - element_iter.collect_arr(); - array.with_validity_typed(validity) - }); - ChunkedArray::from_chunk_iter(lhs.name().clone(), iter) - }, + _ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub)), } } diff --git a/crates/polars-ops/src/chunked_array/strings/starts_with.rs b/crates/polars-ops/src/chunked_array/strings/starts_with.rs index 116b9f3c4656..91096931125b 100644 --- a/crates/polars-ops/src/chunked_array/strings/starts_with.rs +++ b/crates/polars-ops/src/chunked_array/strings/starts_with.rs @@ -23,25 +23,3 @@ pub(crate) fn starts_with_str(view: View, prefix: &str, buffers: &[Buffer]) } } } - -/// Checks if the string starts with the prefix -/// If you call this in a loop and the prefix doesn't change then prefer starts_with_str() -pub(crate) fn starts_with_view( - view: View, - prefix: View, - left_buffers: &[Buffer], - right_buffers: &[Buffer], -) -> bool { - unsafe { - if !view.prefix.to_le_bytes()[0..view.length.min(4) as usize] - .starts_with(&prefix.prefix.to_le_bytes()[..view.length.min(4) as usize]) - { - return false; - } - - let left_buffer = view.get_slice_unchecked(left_buffers); - let right_buffer = prefix.get_slice_unchecked(right_buffers); - - left_buffer.starts_with(right_buffer) - } -} From 023ea87df5f851d870050df672faf4883a2c184e Mon Sep 17 00:00:00 2001 From: Stijn Herfst Date: Fri, 22 Nov 2024 13:06:55 +0100 Subject: [PATCH 10/10] Remove unsafe --- crates/polars-ops/src/chunked_array/strings/namespace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index f54acad433b6..e2ab82ac3aca 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -220,7 +220,7 @@ pub trait StringNameSpaceImpl: AsString { /// Check if strings starts with a substring fn starts_with(&self, sub: &str) -> BooleanChunked { let ca = self.as_string(); - let iter = ca.downcast_iter().map(|arr| unsafe { + let iter = ca.downcast_iter().map(|arr| { let out: ::Array = arr .views() .iter()