diff --git a/Cargo.lock b/Cargo.lock index e332fe6e1a87..9fce5349d361 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3053,7 +3053,6 @@ dependencies = [ "regex", "serde", "serde_json", - "smartstring", "thiserror", "version_check", "xxhash-rust", @@ -3101,7 +3100,6 @@ dependencies = [ "polars-time", "polars-utils", "rayon", - "smartstring", ] [[package]] @@ -3152,7 +3150,6 @@ dependencies = [ "serde_json", "simd-json", "simdutf8", - "smartstring", "tempfile", "tokio", "tokio-util", @@ -3204,7 +3201,6 @@ dependencies = [ "pyo3", "rayon", "serde_json", - "smartstring", "tokio", "version_check", ] @@ -3260,7 +3256,6 @@ dependencies = [ "regex", "serde", "serde_json", - "smartstring", "unicode-reverse", "version_check", ] @@ -3315,7 +3310,6 @@ dependencies = [ "polars-row", "polars-utils", "rayon", - "smartstring", "tokio", "uuid", "version_check", @@ -3353,7 +3347,6 @@ dependencies = [ "regex", "serde", "serde_json", - "smartstring", "strum_macros 0.26.4", "version_check", ] @@ -3387,7 +3380,6 @@ dependencies = [ "pyo3", "recursive", "serde_json", - "smartstring", "thiserror", "version_check", ] @@ -3415,6 +3407,7 @@ dependencies = [ "polars-ops", "polars-plan", "polars-time", + "polars-utils", "rand", "serde", "serde_json", @@ -3465,7 +3458,6 @@ dependencies = [ "polars-utils", "regex", "serde", - "smartstring", ] [[package]] @@ -3486,7 +3478,6 @@ dependencies = [ "raw-cpuid", "rayon", "serde", - "smartstring", "stacker", "sysinfo", "version_check", @@ -4423,18 +4414,6 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "smartstring" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" -dependencies = [ - "autocfg", - "serde", - "static_assertions", - "version_check", -] - [[package]] name = "snafu" version = "0.7.5" diff --git 
a/Cargo.toml b/Cargo.toml index 44ba246bccae..4650ca1d5b9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,12 +75,11 @@ recursive = "0.1" regex = "1.9" reqwest = { version = "0.12", default-features = false } ryu = "1.0.13" -serde = { version = "1.0.188", features = ["derive"] } +serde = { version = "1.0.188", features = ["derive", "rc"] } serde_json = "1" simd-json = { version = "0.13", features = ["known-key"] } simdutf8 = "0.1.4" slotmap = "1" -smartstring = "1" sqlparser = "0.49" stacker = "0.1" streaming-iterator = "0.1.9" diff --git a/README.md b/README.md index 5d7e3c3bf203..5c1dfc76c871 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Refer to the [Polars CLI repository](https://github.com/pola-rs/polars-cli) for ### Blazingly fast -Polars is very fast. In fact, it is one of the best performing solutions available. See the [TPC-H benchmarks](https://www.pola.rs/benchmarks.html) results. +Polars is very fast. In fact, it is one of the best performing solutions available. See the [PDS-H benchmarks](https://www.pola.rs/benchmarks.html) results. ### Lightweight @@ -247,9 +247,9 @@ can `pip install polars` and `import polars`. ## Using custom Rust functions in Python Extending Polars with UDFs compiled in Rust is easy. We expose PyO3 extensions for `DataFrame` and `Series` -data structures. See more in https://github.com/pola-rs/pyo3-polars. +data structures. See more in <https://github.com/pola-rs/pyo3-polars>. -## Going big... +## Going big Do you expect more than 2^32 (~4.2 billion) rows? Compile Polars with the `bigidx` feature flag or, for Python users, install `pip install polars-u64-idx`. 
diff --git a/crates/polars-arrow/src/array/fixed_size_list/mod.rs b/crates/polars-arrow/src/array/fixed_size_list/mod.rs index 7e512cba5203..4fd817b90264 100644 --- a/crates/polars-arrow/src/array/fixed_size_list/mod.rs +++ b/crates/polars-arrow/src/array/fixed_size_list/mod.rs @@ -11,6 +11,7 @@ mod iterator; mod mutable; pub use mutable::*; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; /// The Arrow's equivalent to an immutable `Vec>` where `T` is an Arrow type. /// Cloning and slicing this struct is `O(1)`. @@ -199,7 +200,7 @@ impl FixedSizeListArray { /// Returns a [`ArrowDataType`] consistent with [`FixedSizeListArray`]. pub fn default_datatype(data_type: ArrowDataType, size: usize) -> ArrowDataType { - let field = Box::new(Field::new("item", data_type, true)); + let field = Box::new(Field::new(PlSmallStr::from_static("item"), data_type, true)); ArrowDataType::FixedSizeList(field, size) } } diff --git a/crates/polars-arrow/src/array/fixed_size_list/mutable.rs b/crates/polars-arrow/src/array/fixed_size_list/mutable.rs index ddd03b9ea099..9b05396565f7 100644 --- a/crates/polars-arrow/src/array/fixed_size_list/mutable.rs +++ b/crates/polars-arrow/src/array/fixed_size_list/mutable.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::FixedSizeListArray; use crate::array::physical_binary::extend_validity; @@ -35,7 +36,7 @@ impl MutableFixedSizeListArray { } /// Creates a new [`MutableFixedSizeListArray`] from a [`MutableArray`] and size. 
- pub fn new_with_field(values: M, name: &str, nullable: bool, size: usize) -> Self { + pub fn new_with_field(values: M, name: PlSmallStr, nullable: bool, size: usize) -> Self { let data_type = ArrowDataType::FixedSizeList( Box::new(Field::new(name, values.data_type().clone(), nullable)), size, diff --git a/crates/polars-arrow/src/array/list/mod.rs b/crates/polars-arrow/src/array/list/mod.rs index 27c20b72d0ea..17e6aff369a1 100644 --- a/crates/polars-arrow/src/array/list/mod.rs +++ b/crates/polars-arrow/src/array/list/mod.rs @@ -13,6 +13,7 @@ pub use iterator::*; mod mutable; pub use mutable::*; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; /// An [`Array`] semantically equivalent to `Vec>>>` with Arrow's in-memory. #[derive(Clone)] @@ -185,7 +186,7 @@ impl ListArray { impl ListArray { /// Returns a default [`ArrowDataType`]: inner field is named "item" and is nullable pub fn default_datatype(data_type: ArrowDataType) -> ArrowDataType { - let field = Box::new(Field::new("item", data_type, true)); + let field = Box::new(Field::new(PlSmallStr::from_static("item"), data_type, true)); if O::IS_LARGE { ArrowDataType::LargeList(field) } else { diff --git a/crates/polars-arrow/src/array/list/mutable.rs b/crates/polars-arrow/src/array/list/mutable.rs index 3fd528019063..7a1601e4ceaa 100644 --- a/crates/polars-arrow/src/array/list/mutable.rs +++ b/crates/polars-arrow/src/array/list/mutable.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use polars_error::{polars_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::ListArray; use crate::array::physical_binary::extend_validity; @@ -122,7 +123,7 @@ impl MutableListArray { } /// Creates a new [`MutableListArray`] from a [`MutableArray`]. 
- pub fn new_with_field(values: M, name: &str, nullable: bool) -> Self { + pub fn new_with_field(values: M, name: PlSmallStr, nullable: bool) -> Self { let field = Box::new(Field::new(name, values.data_type().clone(), nullable)); let data_type = if O::IS_LARGE { ArrowDataType::LargeList(field) diff --git a/crates/polars-arrow/src/array/primitive/fmt.rs b/crates/polars-arrow/src/array/primitive/fmt.rs index 1b3c5776b180..35f342aec3fa 100644 --- a/crates/polars-arrow/src/array/primitive/fmt.rs +++ b/crates/polars-arrow/src/array/primitive/fmt.rs @@ -56,7 +56,7 @@ pub fn get_write_value<'a, T: NativeType, F: Write>( Time64(_) => unreachable!(), // remaining are not valid Timestamp(time_unit, tz) => { if let Some(tz) = tz { - let timezone = temporal_conversions::parse_offset(tz); + let timezone = temporal_conversions::parse_offset(tz.as_str()); match timezone { Ok(timezone) => { dyn_primitive!(array, i64, |time| { @@ -65,7 +65,7 @@ pub fn get_write_value<'a, T: NativeType, F: Write>( }, #[cfg(feature = "chrono-tz")] Err(_) => { - let timezone = temporal_conversions::parse_offset_tz(tz); + let timezone = temporal_conversions::parse_offset_tz(tz.as_str()); match timezone { Ok(timezone) => dyn_primitive!(array, i64, |time| { temporal_conversions::timestamp_to_datetime( diff --git a/crates/polars-arrow/src/array/struct_/mod.rs b/crates/polars-arrow/src/array/struct_/mod.rs index cecaea0d4364..6a31fe801209 100644 --- a/crates/polars-arrow/src/array/struct_/mod.rs +++ b/crates/polars-arrow/src/array/struct_/mod.rs @@ -23,8 +23,8 @@ use crate::compute::utils::combine_validities_and; /// let int = Int32Array::from_slice(&[42, 28, 19, 31]).boxed(); /// /// let fields = vec![ -/// Field::new("b", ArrowDataType::Boolean, false), -/// Field::new("c", ArrowDataType::Int32, false), +/// Field::new("b".into(), ArrowDataType::Boolean, false), +/// Field::new("c".into(), ArrowDataType::Int32, false), /// ]; /// /// let array = StructArray::new(ArrowDataType::Struct(fields), 
vec![boolean, int], None); diff --git a/crates/polars-arrow/src/bitmap/bitmap_ops.rs b/crates/polars-arrow/src/bitmap/bitmap_ops.rs index 9e5ac502e6b5..a3edb658be4e 100644 --- a/crates/polars-arrow/src/bitmap/bitmap_ops.rs +++ b/crates/polars-arrow/src/bitmap/bitmap_ops.rs @@ -300,6 +300,22 @@ pub fn intersects_with_mut(lhs: &MutableBitmap, rhs: &MutableBitmap) -> bool { ) } +pub fn num_edges(lhs: &Bitmap) -> usize { + if lhs.is_empty() { + return 0; + } + + // @TODO: It is probably quite inefficient to do it like this because now either one is not + // aligned. Maybe, we can implement a smarter way to do this. + binary_fold( + &unsafe { lhs.clone().sliced_unchecked(0, lhs.len() - 1) }, + &unsafe { lhs.clone().sliced_unchecked(1, lhs.len() - 1) }, + |l, r| (l ^ r).count_ones() as usize, + 0, + |acc, v| acc + v, + ) +} + /// Compute `out[i] = if selector[i] { truthy[i] } else { falsy }`. pub fn select_constant(selector: &Bitmap, truthy: &Bitmap, falsy: bool) -> Bitmap { let falsy_mask: u64 = if falsy { diff --git a/crates/polars-arrow/src/bitmap/immutable.rs b/crates/polars-arrow/src/bitmap/immutable.rs index 4b52045afa9f..6ad76a07b639 100644 --- a/crates/polars-arrow/src/bitmap/immutable.rs +++ b/crates/polars-arrow/src/bitmap/immutable.rs @@ -555,6 +555,11 @@ impl Bitmap { pub fn select_constant(&self, truthy: &Self, falsy: bool) -> Self { super::bitmap_ops::select_constant(self, truthy, falsy) } + + /// Calculates the number of edges from `0 -> 1` and `1 -> 0`. + pub fn num_edges(&self) -> usize { + super::bitmap_ops::num_edges(self) + } } impl> From

for Bitmap { diff --git a/crates/polars-arrow/src/compute/cast/primitive_to.rs b/crates/polars-arrow/src/compute/cast/primitive_to.rs index 0ab353f16cbb..13fc8c8be3f0 100644 --- a/crates/polars-arrow/src/compute/cast/primitive_to.rs +++ b/crates/polars-arrow/src/compute/cast/primitive_to.rs @@ -2,6 +2,7 @@ use std::hash::Hash; use num_traits::{AsPrimitive, Float, ToPrimitive}; use polars_error::PolarsResult; +use polars_utils::pl_str::PlSmallStr; use super::CastOptionsImpl; use crate::array::*; @@ -434,7 +435,7 @@ pub fn timestamp_to_timestamp( from: &PrimitiveArray, from_unit: TimeUnit, to_unit: TimeUnit, - tz: &Option, + tz: &Option, ) -> PrimitiveArray { let from_size = time_unit_multiple(from_unit); let to_size = time_unit_multiple(to_unit); diff --git a/crates/polars-arrow/src/compute/temporal.rs b/crates/polars-arrow/src/compute/temporal.rs index 437089b72891..6bc76aa0f9a3 100644 --- a/crates/polars-arrow/src/compute/temporal.rs +++ b/crates/polars-arrow/src/compute/temporal.rs @@ -59,12 +59,12 @@ macro_rules! date_like { ArrowDataType::Timestamp(time_unit, Some(timezone_str)) => { let array = $array.as_any().downcast_ref().unwrap(); - if let Ok(timezone) = parse_offset(timezone_str) { + if let Ok(timezone) = parse_offset(timezone_str.as_str()) { Ok(extract_impl(array, *time_unit, timezone, |x| { x.$extract().try_into().unwrap() })) } else { - chrono_tz(array, *time_unit, timezone_str, |x| { + chrono_tz(array, *time_unit, timezone_str.as_str(), |x| { x.$extract().try_into().unwrap() }) } @@ -129,12 +129,12 @@ macro_rules! 
time_like { ArrowDataType::Timestamp(time_unit, Some(timezone_str)) => { let array = $array.as_any().downcast_ref().unwrap(); - if let Ok(timezone) = parse_offset(timezone_str) { + if let Ok(timezone) = parse_offset(timezone_str.as_str()) { Ok(extract_impl(array, *time_unit, timezone, |x| { x.$extract().try_into().unwrap() })) } else { - chrono_tz(array, *time_unit, timezone_str, |x| { + chrono_tz(array, *time_unit, timezone_str.as_str(), |x| { x.$extract().try_into().unwrap() }) } diff --git a/crates/polars-arrow/src/datatypes/field.rs b/crates/polars-arrow/src/datatypes/field.rs index 950f081017c4..f0548ae2b3ce 100644 --- a/crates/polars-arrow/src/datatypes/field.rs +++ b/crates/polars-arrow/src/datatypes/field.rs @@ -1,3 +1,4 @@ +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -15,7 +16,7 @@ use super::{ArrowDataType, Metadata}; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Field { /// Its name - pub name: String, + pub name: PlSmallStr, /// Its logical [`ArrowDataType`] pub data_type: ArrowDataType, /// Its nullability @@ -26,9 +27,9 @@ pub struct Field { impl Field { /// Creates a new [`Field`]. 
- pub fn new>(name: T, data_type: ArrowDataType, is_nullable: bool) -> Self { + pub fn new(name: PlSmallStr, data_type: ArrowDataType, is_nullable: bool) -> Self { Field { - name: name.into(), + name, data_type, is_nullable, metadata: Default::default(), @@ -56,8 +57,18 @@ impl Field { #[cfg(feature = "arrow_rs")] impl From for arrow_schema::Field { fn from(value: Field) -> Self { - Self::new(value.name, value.data_type.into(), value.is_nullable) - .with_metadata(value.metadata.into_iter().collect()) + Self::new( + value.name.to_string(), + value.data_type.into(), + value.is_nullable, + ) + .with_metadata( + value + .metadata + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + ) } } @@ -75,9 +86,14 @@ impl From<&arrow_schema::Field> for Field { let metadata = value .metadata() .iter() - .map(|(k, v)| (k.clone(), v.clone())) + .map(|(k, v)| (PlSmallStr::from_str(k), PlSmallStr::from_str(v))) .collect(); - Self::new(value.name(), data_type, value.is_nullable()).with_metadata(metadata) + Self::new( + PlSmallStr::from_str(value.name().as_str()), + data_type, + value.is_nullable(), + ) + .with_metadata(metadata) } } diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs index c232c985d8a0..85a2df5ada54 100644 --- a/crates/polars-arrow/src/datatypes/mod.rs +++ b/crates/polars-arrow/src/datatypes/mod.rs @@ -9,14 +9,15 @@ use std::sync::Arc; pub use field::Field; pub use physical_type::*; +use polars_utils::pl_str::PlSmallStr; pub use schema::{ArrowSchema, ArrowSchemaRef}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -/// typedef for [BTreeMap] denoting [`Field`]'s and [`ArrowSchema`]'s metadata. -pub type Metadata = BTreeMap; -/// typedef for [Option<(String, Option)>] descr -pub(crate) type Extension = Option<(String, Option)>; +/// typedef for [BTreeMap] denoting [`Field`]'s and [`ArrowSchema`]'s metadata. 
+pub type Metadata = BTreeMap; +/// typedef for [Option<(PlSmallStr, Option)>] descr +pub(crate) type Extension = Option<(PlSmallStr, Option)>; /// The set of supported logical types in this crate. /// @@ -70,7 +71,7 @@ pub enum ArrowDataType { /// /// When the timezone is not specified, the timestamp is considered to have no timezone /// and is represented _as is_ - Timestamp(TimeUnit, Option), + Timestamp(TimeUnit, Option), /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01) /// in days. Date32, @@ -163,7 +164,7 @@ pub enum ArrowDataType { /// - name /// - physical type /// - metadata - Extension(String, Box, Option), + Extension(PlSmallStr, Box, Option), /// A binary type that inlines small values /// and can intern bytes. BinaryView, @@ -193,7 +194,9 @@ impl From for arrow_schema::DataType { ArrowDataType::Float16 => Self::Float16, ArrowDataType::Float32 => Self::Float32, ArrowDataType::Float64 => Self::Float64, - ArrowDataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz.map(Into::into)), + ArrowDataType::Timestamp(unit, tz) => { + Self::Timestamp(unit.into(), tz.map(|x| Arc::::from(x.as_str()))) + }, ArrowDataType::Date32 => Self::Date32, ArrowDataType::Date64 => Self::Date64, ArrowDataType::Time32(unit) => Self::Time32(unit.into()), @@ -260,7 +263,7 @@ impl From for ArrowDataType { DataType::Float32 => Self::Float32, DataType::Float64 => Self::Float64, DataType::Timestamp(unit, tz) => { - Self::Timestamp(unit.into(), tz.map(|x| x.to_string())) + Self::Timestamp(unit.into(), tz.map(|x| PlSmallStr::from_str(x.as_ref()))) }, DataType::Date32 => Self::Date32, DataType::Date64 => Self::Date64, @@ -545,6 +548,22 @@ impl ArrowDataType { } } + pub fn is_nested(&self) -> bool { + use ArrowDataType as D; + + matches!( + self, + D::List(_) + | D::LargeList(_) + | D::FixedSizeList(_, _) + | D::Struct(_) + | D::Union(_, _, _) + | D::Map(_, _) + | D::Dictionary(_, _, _) + | D::Extension(_, _, _) + ) + } + pub fn is_view(&self) -> bool { 
matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView) } @@ -593,8 +612,10 @@ pub type SchemaRef = Arc; /// support get extension for metadata pub fn get_extension(metadata: &Metadata) -> Extension { - if let Some(name) = metadata.get("ARROW:extension:name") { - let metadata = metadata.get("ARROW:extension:metadata").cloned(); + if let Some(name) = metadata.get(&PlSmallStr::from_static("ARROW:extension:name")) { + let metadata = metadata + .get(&PlSmallStr::from_static("ARROW:extension:metadata")) + .cloned(); Some((name.clone(), metadata)) } else { None diff --git a/crates/polars-arrow/src/ffi/schema.rs b/crates/polars-arrow/src/ffi/schema.rs index f958311d7988..0038e7941445 100644 --- a/crates/polars-arrow/src/ffi/schema.rs +++ b/crates/polars-arrow/src/ffi/schema.rs @@ -3,6 +3,7 @@ use std::ffi::{CStr, CString}; use std::ptr; use polars_error::{polars_bail, polars_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::ArrowSchema; use crate::datatypes::{ @@ -74,7 +75,7 @@ impl ArrowSchema { { flags += *is_ordered as i64; // we do not store field info in the dict values, so can't recover it all :( - let field = Field::new("", values.as_ref().clone(), true); + let field = Field::new(PlSmallStr::const_default(), values.as_ref().clone(), true); Some(Box::new(ArrowSchema::new(&field))) } else { None @@ -90,12 +91,15 @@ impl ArrowSchema { // metadata if let Some(extension_metadata) = extension_metadata { metadata.insert( - "ARROW:extension:metadata".to_string(), + PlSmallStr::from_static("ARROW:extension:metadata"), extension_metadata.clone(), ); } - metadata.insert("ARROW:extension:name".to_string(), name.clone()); + metadata.insert( + PlSmallStr::from_static("ARROW:extension:name"), + name.clone(), + ); Some(metadata_to_bytes(&metadata)) } else if !metadata.is_empty() { @@ -104,7 +108,7 @@ impl ArrowSchema { None }; - let name = CString::new(name).unwrap(); + let name = CString::new(name.as_str()).unwrap(); let format = 
CString::new(format).unwrap(); let mut private = Box::new(SchemaPrivateData { @@ -216,7 +220,12 @@ pub(crate) unsafe fn to_field(schema: &ArrowSchema) -> PolarsResult { data_type }; - Ok(Field::new(schema.name(), data_type, schema.nullable()).with_metadata(metadata)) + Ok(Field::new( + PlSmallStr::from_str(schema.name()), + data_type, + schema.nullable(), + ) + .with_metadata(metadata)) } fn to_integer_type(format: &str) -> PolarsResult { @@ -301,14 +310,18 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> PolarsResult { ["tsn", ""] => ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), // Timestamps with timezone - ["tss", tz] => ArrowDataType::Timestamp(TimeUnit::Second, Some(tz.to_string())), + ["tss", tz] => { + ArrowDataType::Timestamp(TimeUnit::Second, Some(PlSmallStr::from_str(tz))) + }, ["tsm", tz] => { - ArrowDataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())) + ArrowDataType::Timestamp(TimeUnit::Millisecond, Some(PlSmallStr::from_str(tz))) }, ["tsu", tz] => { - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())) + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(PlSmallStr::from_str(tz))) + }, + ["tsn", tz] => { + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some(PlSmallStr::from_str(tz))) }, - ["tsn", tz] => ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())), ["w", size_raw] => { // Example: "w:42" fixed-width binary [42 bytes] @@ -451,7 +464,7 @@ fn to_format(data_type: &ArrowDataType) -> String { format!( "ts{}:{}", unit, - tz.as_ref().map(|x| x.as_ref()).unwrap_or("") + tz.as_ref().map(|x| x.as_str()).unwrap_or("") ) }, ArrowDataType::Utf8View => "vu".to_string(), @@ -468,9 +481,9 @@ fn to_format(data_type: &ArrowDataType) -> String { let mut r = format!("+u{sparsness}:"); let ids = if let Some(ids) = ids { ids.iter() - .fold(String::new(), |a, b| a + &b.to_string() + ",") + .fold(String::new(), |a, b| a + b.to_string().as_str() + ",") } else { - (0..f.len()).fold(String::new(), |a, b| a + 
&b.to_string() + ",") + (0..f.len()).fold(String::new(), |a, b| a + b.to_string().as_str() + ",") }; let ids = &ids[..ids.len() - 1]; // take away last "," r.push_str(ids); @@ -498,7 +511,7 @@ pub(super) fn get_child(data_type: &ArrowDataType, index: usize) -> PolarsResult } } -fn metadata_to_bytes(metadata: &BTreeMap) -> Vec { +fn metadata_to_bytes(metadata: &BTreeMap) -> Vec { let a = (metadata.len() as i32).to_ne_bytes().to_vec(); metadata.iter().fold(a, |mut acc, (key, value)| { acc.extend((key.len() as i32).to_ne_bytes()); @@ -541,13 +554,13 @@ unsafe fn metadata_from_bytes(data: *const ::std::os::raw::c_char) -> (Metadata, data = data.add(value_len); match key { "ARROW:extension:name" => { - extension_name = Some(value.to_string()); + extension_name = Some(PlSmallStr::from_str(value)); }, "ARROW:extension:metadata" => { - extension_metadata = Some(value.to_string()); + extension_metadata = Some(PlSmallStr::from_str(value)); }, _ => { - result.insert(key.to_string(), value.to_string()); + result.insert(PlSmallStr::from_str(key), PlSmallStr::from_str(value)); }, }; } @@ -587,35 +600,50 @@ mod tests { ArrowDataType::LargeBinary, ArrowDataType::FixedSizeBinary(2), ArrowDataType::List(Box::new(Field::new( - "example", + PlSmallStr::from_static("example"), ArrowDataType::Boolean, false, ))), ArrowDataType::FixedSizeList( - Box::new(Field::new("example", ArrowDataType::Boolean, false)), + Box::new(Field::new( + PlSmallStr::from_static("example"), + ArrowDataType::Boolean, + false, + )), 2, ), ArrowDataType::LargeList(Box::new(Field::new( - "example", + PlSmallStr::from_static("example"), ArrowDataType::Boolean, false, ))), ArrowDataType::Struct(vec![ - Field::new("a", ArrowDataType::Int64, true), + Field::new(PlSmallStr::from_static("a"), ArrowDataType::Int64, true), Field::new( - "b", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))), + PlSmallStr::from_static("b"), + ArrowDataType::List(Box::new(Field::new( + 
PlSmallStr::from_static("item"), + ArrowDataType::Int32, + true, + ))), true, ), ]), - ArrowDataType::Map(Box::new(Field::new("a", ArrowDataType::Int64, true)), true), + ArrowDataType::Map( + Box::new(Field::new( + PlSmallStr::from_static("a"), + ArrowDataType::Int64, + true, + )), + true, + ), ArrowDataType::Union( vec![ - Field::new("a", ArrowDataType::Int64, true), + Field::new(PlSmallStr::from_static("a"), ArrowDataType::Int64, true), Field::new( - "b", + PlSmallStr::from_static("b"), ArrowDataType::List(Box::new(Field::new( - "item", + PlSmallStr::from_static("item"), ArrowDataType::Int32, true, ))), @@ -627,11 +655,11 @@ mod tests { ), ArrowDataType::Union( vec![ - Field::new("a", ArrowDataType::Int64, true), + Field::new(PlSmallStr::from_static("a"), ArrowDataType::Int64, true), Field::new( - "b", + PlSmallStr::from_static("b"), ArrowDataType::List(Box::new(Field::new( - "item", + PlSmallStr::from_static("item"), ArrowDataType::Int32, true, ))), @@ -651,7 +679,7 @@ mod tests { dts.push(ArrowDataType::Timestamp(time_unit, None)); dts.push(ArrowDataType::Timestamp( time_unit, - Some("00:00".to_string()), + Some(PlSmallStr::from_static("00:00")), )); dts.push(ArrowDataType::Duration(time_unit)); } @@ -664,7 +692,7 @@ mod tests { } for expected in dts { - let field = Field::new("a", expected.clone(), true); + let field = Field::new(PlSmallStr::from_static("a"), expected.clone(), true); let schema = ArrowSchema::new(&field); let result = unsafe { super::to_data_type(&schema).unwrap() }; assert_eq!(result, expected); diff --git a/crates/polars-arrow/src/io/avro/read/schema.rs b/crates/polars-arrow/src/io/avro/read/schema.rs index a29402ae600f..1538abbeddab 100644 --- a/crates/polars-arrow/src/io/avro/read/schema.rs +++ b/crates/polars-arrow/src/io/avro/read/schema.rs @@ -1,18 +1,22 @@ use avro_schema::schema::{Enum, Fixed, Record, Schema as AvroSchema}; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use crate::datatypes::*; 
fn external_props(schema: &AvroSchema) -> Metadata { let mut props = Metadata::new(); - match &schema { + match schema { AvroSchema::Record(Record { doc: Some(ref doc), .. }) | AvroSchema::Enum(Enum { doc: Some(ref doc), .. }) => { - props.insert("avro::doc".to_string(), doc.clone()); + props.insert( + PlSmallStr::from_static("avro::doc"), + PlSmallStr::from_str(doc.as_str()), + ); }, _ => {}, } @@ -59,12 +63,14 @@ fn schema_to_field( avro_schema::schema::LongLogical::Time => { ArrowDataType::Time64(TimeUnit::Microsecond) }, - avro_schema::schema::LongLogical::TimestampMillis => { - ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("00:00".to_string())) - }, - avro_schema::schema::LongLogical::TimestampMicros => { - ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("00:00".to_string())) - }, + avro_schema::schema::LongLogical::TimestampMillis => ArrowDataType::Timestamp( + TimeUnit::Millisecond, + Some(PlSmallStr::from_static("00:00")), + ), + avro_schema::schema::LongLogical::TimestampMicros => ArrowDataType::Timestamp( + TimeUnit::Microsecond, + Some(PlSmallStr::from_static("00:00")), + ), avro_schema::schema::LongLogical::LocalTimestampMillis => { ArrowDataType::Timestamp(TimeUnit::Millisecond, None) }, @@ -118,7 +124,10 @@ fn schema_to_field( .map(|field| { let mut props = Metadata::new(); if let Some(doc) = &field.doc { - props.insert("avro::doc".to_string(), doc.clone()); + props.insert( + PlSmallStr::from_static("avro::doc"), + PlSmallStr::from_str(doc), + ); } schema_to_field(&field.schema, Some(&field.name), props) }) @@ -127,7 +136,7 @@ fn schema_to_field( }, AvroSchema::Enum { .. 
} => { return Ok(Field::new( - name.unwrap_or_default(), + PlSmallStr::from_str(name.unwrap_or_default()), ArrowDataType::Dictionary(IntegerType::Int32, Box::new(ArrowDataType::Utf8), false), false, )) @@ -147,5 +156,5 @@ fn schema_to_field( let name = name.unwrap_or_default(); - Ok(Field::new(name, data_type, nullable).with_metadata(props)) + Ok(Field::new(PlSmallStr::from_str(name), data_type, nullable).with_metadata(props)) } diff --git a/crates/polars-arrow/src/io/avro/write/schema.rs b/crates/polars-arrow/src/io/avro/write/schema.rs index 8171798a692c..03e28c6d2acc 100644 --- a/crates/polars-arrow/src/io/avro/write/schema.rs +++ b/crates/polars-arrow/src/io/avro/write/schema.rs @@ -25,7 +25,7 @@ pub fn to_record(schema: &ArrowSchema, name: String) -> PolarsResult { fn field_to_field(field: &Field, name_counter: &mut i32) -> PolarsResult { let schema = type_to_schema(field.data_type(), field.is_nullable, name_counter)?; - Ok(AvroField::new(&field.name, schema)) + Ok(AvroField::new(field.name.to_string(), schema)) } fn type_to_schema( diff --git a/crates/polars-arrow/src/io/ipc/read/common.rs b/crates/polars-arrow/src/io/ipc/read/common.rs index d2cfb407f3c4..7316e7262cb2 100644 --- a/crates/polars-arrow/src/io/ipc/read/common.rs +++ b/crates/polars-arrow/src/io/ipc/read/common.rs @@ -3,6 +3,7 @@ use std::io::{Read, Seek}; use polars_error::{polars_bail, polars_err, PolarsResult}; use polars_utils::aliases::PlHashMap; +use polars_utils::pl_str::PlSmallStr; use super::deserialize::{read, skip}; use super::Dictionaries; @@ -279,7 +280,11 @@ pub fn read_dictionary( }; // Make a fake schema for the dictionary batch. 
- let fields = vec![Field::new("", value_type.clone(), false)]; + let fields = vec![Field::new( + PlSmallStr::const_default(), + value_type.clone(), + false, + )]; let ipc_schema = IpcSchema { fields: vec![first_ipc_field.clone()], is_little_endian: ipc_schema.is_little_endian, diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs index a6c1743e6a0b..655455a9606a 100644 --- a/crates/polars-arrow/src/io/ipc/read/schema.rs +++ b/crates/polars-arrow/src/io/ipc/read/schema.rs @@ -1,6 +1,7 @@ use arrow_format::ipc::planus::ReadAsRoot; use arrow_format::ipc::{FieldRef, FixedSizeListRef, MapRef, TimeRef, TimestampRef, UnionRef}; use polars_error::{polars_bail, polars_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::super::{IpcField, IpcSchema}; use super::{OutOfSpecKind, StreamMetadata}; @@ -31,10 +32,11 @@ fn deserialize_field(ipc_field: arrow_format::ipc::FieldRef) -> PolarsResult<(Fi let (data_type, ipc_field_) = get_data_type(ipc_field, extension, true)?; let field = Field { - name: ipc_field - .name()? - .ok_or_else(|| polars_err!(oos = "Every field in IPC must have a name"))? - .to_string(), + name: PlSmallStr::from_str( + ipc_field + .name()? + .ok_or_else(|| polars_err!(oos = "Every field in IPC must have a name"))?, + ), data_type, is_nullable: ipc_field.nullable()?, metadata, @@ -49,7 +51,7 @@ fn read_metadata(field: &arrow_format::ipc::FieldRef) -> PolarsResult for kv in list { let kv = kv?; if let (Some(k), Some(v)) = (kv.key()?, kv.value()?) 
{ - metadata_map.insert(k.to_string(), v.to_string()); + metadata_map.insert(PlSmallStr::from_str(k), PlSmallStr::from_str(v)); } } metadata_map @@ -100,10 +102,10 @@ fn deserialize_time(time: TimeRef) -> PolarsResult<(ArrowDataType, IpcField)> { } fn deserialize_timestamp(timestamp: TimestampRef) -> PolarsResult<(ArrowDataType, IpcField)> { - let timezone = timestamp.timezone()?.map(|tz| tz.to_string()); + let timezone = timestamp.timezone()?; let time_unit = deserialize_timeunit(timestamp.unit()?)?; Ok(( - ArrowDataType::Timestamp(time_unit, timezone), + ArrowDataType::Timestamp(time_unit, timezone.map(PlSmallStr::from_str)), IpcField::default(), )) } @@ -397,7 +399,7 @@ pub(super) fn fb_to_schema( let v_str = kv.value()?; if let Some(k) = k_str { if let Some(v) = v_str { - metadata.insert(k.to_string(), v.to_string()); + metadata.insert(PlSmallStr::from_str(k), PlSmallStr::from_str(v)); } } } diff --git a/crates/polars-arrow/src/io/ipc/write/schema.rs b/crates/polars-arrow/src/io/ipc/write/schema.rs index 8243e07a7d04..192b0fb5d6ec 100644 --- a/crates/polars-arrow/src/io/ipc/write/schema.rs +++ b/crates/polars-arrow/src/io/ipc/write/schema.rs @@ -41,7 +41,7 @@ pub fn serialize_schema( let custom_metadata = schema .metadata .iter() - .map(|(k, v)| key_value(k, v)) + .map(|(k, v)| key_value(k.clone().into_string(), v.clone().into_string())) .collect::>(); let custom_metadata = (!custom_metadata.is_empty()).then_some(custom_metadata); @@ -63,22 +63,22 @@ fn key_value(key: impl Into, val: impl Into) -> arrow_format::ip fn write_metadata(metadata: &Metadata, kv_vec: &mut Vec) { for (k, v) in metadata { - if k != "ARROW:extension:name" && k != "ARROW:extension:metadata" { - kv_vec.push(key_value(k, v)); + if k.as_str() != "ARROW:extension:name" && k.as_str() != "ARROW:extension:metadata" { + kv_vec.push(key_value(k.clone().into_string(), v.clone().into_string())); } } } fn write_extension( name: &str, - metadata: &Option, + metadata: Option<&str>, kv_vec: &mut Vec, ) 
{ if let Some(metadata) = metadata { - kv_vec.push(key_value("ARROW:extension:metadata", metadata)); + kv_vec.push(key_value("ARROW:extension:metadata".to_string(), metadata)); } - kv_vec.push(key_value("ARROW:extension:name", name)); + kv_vec.push(key_value("ARROW:extension:name".to_string(), name)); } /// Create an IPC Field from an Arrow Field @@ -86,7 +86,11 @@ pub(crate) fn serialize_field(field: &Field, ipc_field: &IpcField) -> arrow_form // custom metadata. let mut kv_vec = vec![]; if let ArrowDataType::Extension(name, _, metadata) = field.data_type() { - write_extension(name, metadata, &mut kv_vec); + write_extension( + name.as_str(), + metadata.as_ref().map(|x| x.as_str()), + &mut kv_vec, + ); } let type_ = serialize_type(field.data_type()); @@ -95,7 +99,11 @@ pub(crate) fn serialize_field(field: &Field, ipc_field: &IpcField) -> arrow_form let dictionary = if let ArrowDataType::Dictionary(index_type, inner, is_ordered) = field.data_type() { if let ArrowDataType::Extension(name, _, metadata) = inner.as_ref() { - write_extension(name, metadata, &mut kv_vec); + write_extension( + name.as_str(), + metadata.as_ref().map(|x| x.as_str()), + &mut kv_vec, + ); } Some(serialize_dictionary( index_type, @@ -117,7 +125,7 @@ pub(crate) fn serialize_field(field: &Field, ipc_field: &IpcField) -> arrow_form }; arrow_format::ipc::Field { - name: Some(field.name.clone()), + name: Some(field.name.to_string()), nullable: field.is_nullable, type_: Some(type_), dictionary: dictionary.map(Box::new), @@ -218,7 +226,7 @@ fn serialize_type(data_type: &ArrowDataType) -> arrow_format::ipc::Type { })), Timestamp(unit, tz) => ipc::Type::Timestamp(Box::new(ipc::Timestamp { unit: serialize_time_unit(unit), - timezone: tz.as_ref().cloned(), + timezone: tz.as_ref().map(|x| x.to_string()), })), Interval(unit) => ipc::Type::Interval(Box::new(ipc::Interval { unit: match unit { diff --git a/crates/polars-arrow/src/mmap/mod.rs b/crates/polars-arrow/src/mmap/mod.rs index 
bda45655d08c..9229352c0fcb 100644 --- a/crates/polars-arrow/src/mmap/mod.rs +++ b/crates/polars-arrow/src/mmap/mod.rs @@ -7,6 +7,7 @@ mod array; use arrow_format::ipc::planus::ReadAsRoot; use arrow_format::ipc::{Block, MessageRef, RecordBatchRef}; use polars_error::{polars_bail, polars_err, to_compute_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use crate::array::Array; use crate::datatypes::{ArrowDataType, Field}; @@ -185,7 +186,7 @@ unsafe fn mmap_dictionary>( }; // Make a fake schema for the dictionary batch. - let field = Field::new("", value_type.clone(), false); + let field = Field::new(PlSmallStr::const_default(), value_type.clone(), false); let chunk = _mmap_record( &[field], diff --git a/crates/polars-arrow/src/temporal_conversions.rs b/crates/polars-arrow/src/temporal_conversions.rs index 487996094f37..b5672f6dd626 100644 --- a/crates/polars-arrow/src/temporal_conversions.rs +++ b/crates/polars-arrow/src/temporal_conversions.rs @@ -3,6 +3,7 @@ use chrono::format::{parse, Parsed, StrftimeItems}; use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta}; use polars_error::{polars_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use crate::array::{PrimitiveArray, Utf8ViewArray}; use crate::datatypes::{ArrowDataType, TimeUnit}; @@ -318,7 +319,7 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> fn utf8view_to_timestamp_impl( array: &Utf8ViewArray, fmt: &str, - time_zone: String, + time_zone: PlSmallStr, tz: T, time_unit: TimeUnit, ) -> PrimitiveArray { @@ -354,10 +355,10 @@ pub const fn time_unit_multiple(unit: TimeUnit) -> i64 { fn chrono_tz_utf_to_timestamp( array: &Utf8ViewArray, fmt: &str, - time_zone: String, + time_zone: PlSmallStr, time_unit: TimeUnit, ) -> PolarsResult> { - let tz = parse_offset_tz(&time_zone)?; + let tz = parse_offset_tz(time_zone.as_str())?; Ok(utf8view_to_timestamp_impl( array, fmt, time_zone, tz, time_unit, )) @@ -367,7 +368,7 @@ fn 
chrono_tz_utf_to_timestamp( fn chrono_tz_utf_to_timestamp( _: &Utf8ViewArray, _: &str, - timezone: String, + timezone: PlSmallStr, _: TimeUnit, ) -> PolarsResult> { panic!("timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)") @@ -389,7 +390,7 @@ fn chrono_tz_utf_to_timestamp( pub(crate) fn utf8view_to_timestamp( array: &Utf8ViewArray, fmt: &str, - time_zone: String, + time_zone: PlSmallStr, time_unit: TimeUnit, ) -> PolarsResult> { let tz = parse_offset(time_zone.as_str()); diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index 204f022ff3ae..456d63ffae40 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -35,7 +35,6 @@ regex = { workspace = true, optional = true } # activate if you want serde support for Series and DataFrames serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } -smartstring = { workspace = true } thiserror = { workspace = true } xxhash-rust = { workspace = true } @@ -117,8 +116,8 @@ dtype-struct = [] bigidx = ["arrow/bigidx", "polars-utils/bigidx"] python = [] -serde = ["dep:serde", "smartstring/serde", "bitflags/serde"] -serde-lazy = ["serde", "arrow/serde", "indexmap/serde", "smartstring/serde", "chrono/serde"] +serde = ["dep:serde", "bitflags/serde"] +serde-lazy = ["serde", "arrow/serde", "indexmap/serde", "chrono/serde"] docs-selection = [ "ndarray", diff --git a/crates/polars-core/src/chunked_array/arithmetic/mod.rs b/crates/polars-core/src/chunked_array/arithmetic/mod.rs index becea8c9b616..e45c12ef12f1 100644 --- a/crates/polars-core/src/chunked_array/arithmetic/mod.rs +++ b/crates/polars-core/src/chunked_array/arithmetic/mod.rs @@ -76,7 +76,7 @@ impl Add for &BinaryChunked { unsafe { std::mem::transmute::<_, &'static [u8]>(out) } }) }, - None => BinaryChunked::full_null(self.name(), self.len()), + None => BinaryChunked::full_null(self.name().clone(), self.len()), }; } // broadcasting path lhs @@ -91,7 +91,7 @@ impl 
Add for &BinaryChunked { // ref is valid for the lifetime of this closure. unsafe { std::mem::transmute::<_, &'static [u8]>(out) } }), - None => BinaryChunked::full_null(self.name(), rhs.len()), + None => BinaryChunked::full_null(self.name().clone(), rhs.len()), }; } @@ -137,7 +137,7 @@ impl Add for &BooleanChunked { let rhs = rhs.get(0); return match rhs { Some(rhs) => unary_elementwise_values(self, |v| v as IdxSize + rhs as IdxSize), - None => IdxCa::full_null(self.name(), self.len()), + None => IdxCa::full_null(self.name().clone(), self.len()), }; } // Broadcasting path lhs. @@ -161,9 +161,9 @@ pub(crate) mod test { use crate::prelude::*; pub(crate) fn create_two_chunked() -> (Int32Chunked, Int32Chunked) { - let mut a1 = Int32Chunked::new("a", &[1, 2, 3]); - let a2 = Int32Chunked::new("a", &[4, 5, 6]); - let a3 = Int32Chunked::new("a", &[1, 2, 3, 4, 5, 6]); + let mut a1 = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let a2 = Int32Chunked::new(PlSmallStr::from_static("a"), &[4, 5, 6]); + let a3 = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3, 4, 5, 6]); a1.append(&a2).unwrap(); (a1, a3) } diff --git a/crates/polars-core/src/chunked_array/array/iterator.rs b/crates/polars-core/src/chunked_array/array/iterator.rs index 52785f1208db..49a013d03dbb 100644 --- a/crates/polars-core/src/chunked_array/array/iterator.rs +++ b/crates/polars-core/src/chunked_array/array/iterator.rs @@ -26,7 +26,7 @@ impl ArrayChunked { /// The lifetime of [AmortSeries] is bound to the iterator. Keeping it alive /// longer than the iterator is UB. pub fn amortized_iter(&self) -> AmortizedListIter> + '_> { - self.amortized_iter_with_name("") + self.amortized_iter_with_name(PlSmallStr::const_default()) } /// This is an iterator over a [`ArrayChunked`] that save allocations. @@ -44,7 +44,7 @@ impl ArrayChunked { /// will be set. 
pub fn amortized_iter_with_name( &self, - name: &str, + name: PlSmallStr, ) -> AmortizedListIter> + '_> { // we create the series container from the inner array // so that the container has the proper dtype. @@ -84,7 +84,7 @@ impl ArrayChunked { { if self.is_empty() { return Ok(Series::new_empty( - self.name(), + self.name().clone(), &DataType::List(Box::new(self.inner_dtype().clone())), ) .list() @@ -109,7 +109,7 @@ impl ArrayChunked { }) .collect::>()? }; - ca.rename(self.name()); + ca.rename(self.name().clone()); if fast_explode { ca.set_fast_explode(); } @@ -135,7 +135,7 @@ impl ArrayChunked { to_arr(&out) }) }) - .collect_ca_with_dtype(self.name(), self.dtype().clone()) + .collect_ca_with_dtype(self.name().clone(), self.dtype().clone()) } /// Try apply a closure `F` to each array. @@ -158,7 +158,7 @@ impl ArrayChunked { }) .transpose() }) - .try_collect_ca_with_dtype(self.name(), self.dtype().clone()) + .try_collect_ca_with_dtype(self.name().clone(), self.dtype().clone()) } /// Zip with a `ChunkedArray` then apply a binary function `F` elementwise. @@ -184,7 +184,7 @@ impl ArrayChunked { let out = f(opt_s, opt_v); out.map(|s| to_arr(&s)) }) - .collect_ca_with_dtype(self.name(), self.dtype().clone()) + .collect_ca_with_dtype(self.name().clone(), self.dtype().clone()) } /// Apply a closure `F` elementwise. 
@@ -196,7 +196,7 @@ impl ArrayChunked { V::Array: ArrayFromIter>, { { - self.amortized_iter().map(f).collect_ca(self.name()) + self.amortized_iter().map(f).collect_ca(self.name().clone()) } } @@ -208,7 +208,9 @@ impl ArrayChunked { V::Array: ArrayFromIter>, { { - self.amortized_iter().map(f).try_collect_ca(self.name()) + self.amortized_iter() + .map(f) + .try_collect_ca(self.name().clone()) } } diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index a3b7a1a1f339..e327449e4124 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -34,7 +34,9 @@ impl ArrayChunked { let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect(); // SAFETY: Data type of arrays matches because they are chunks from the same array. - unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, self.inner_dtype()) } + unsafe { + Series::from_chunks_and_dtype_unchecked(self.name().clone(), chunks, self.inner_dtype()) + } } /// Ignore the list indices and apply `func` to the inner type as [`Series`]. 
@@ -46,12 +48,12 @@ impl ArrayChunked { let ca = self.rechunk(); let field = self .inner_dtype() - .to_arrow_field("item", CompatLevel::newest()); + .to_arrow_field(PlSmallStr::from_static("item"), CompatLevel::newest()); let chunks = ca.downcast_iter().map(|arr| { let elements = unsafe { Series::_try_from_arrow_unchecked_with_md( - self.name(), + self.name().clone(), vec![(*arr.values()).clone()], &field.data_type, Some(&field.metadata), @@ -76,6 +78,6 @@ impl ArrayChunked { Ok(arr) }); - ArrayChunked::try_from_chunk_iter(self.name(), chunks) + ArrayChunked::try_from_chunk_iter(self.name().clone(), chunks) } } diff --git a/crates/polars-core/src/chunked_array/bitwise.rs b/crates/polars-core/src/chunked_array/bitwise.rs index 9e8fc482498c..5238c4f035e7 100644 --- a/crates/polars-core/src/chunked_array/bitwise.rs +++ b/crates/polars-core/src/chunked_array/bitwise.rs @@ -71,10 +71,10 @@ impl BitOr for &BooleanChunked { (1, 1) => {}, (1, _) => { return match self.get(0) { - Some(true) => BooleanChunked::full(self.name(), true, rhs.len()), + Some(true) => BooleanChunked::full(self.name().clone(), true, rhs.len()), Some(false) => { let mut rhs = rhs.clone(); - rhs.rename(self.name()); + rhs.rename(self.name().clone()); rhs }, None => &self.new_from_index(0, rhs.len()) | rhs, @@ -82,7 +82,7 @@ impl BitOr for &BooleanChunked { }, (_, 1) => { return match rhs.get(0) { - Some(true) => BooleanChunked::full(self.name(), true, self.len()), + Some(true) => BooleanChunked::full(self.name().clone(), true, self.len()), Some(false) => self.clone(), None => &rhs.new_from_index(0, self.len()) | self, }; @@ -114,12 +114,12 @@ impl BitXor for &BooleanChunked { return match self.get(0) { Some(true) => { let mut rhs = rhs.not(); - rhs.rename(self.name()); + rhs.rename(self.name().clone()); rhs }, Some(false) => { let mut rhs = rhs.clone(); - rhs.rename(self.name()); + rhs.rename(self.name().clone()); rhs }, None => &self.new_from_index(0, rhs.len()) | rhs, @@ -161,15 +161,15 @@ impl 
BitAnd for &BooleanChunked { (1, 1) => {}, (1, _) => { return match self.get(0) { - Some(true) => rhs.clone().with_name(self.name()), - Some(false) => BooleanChunked::full(self.name(), false, rhs.len()), + Some(true) => rhs.clone().with_name(self.name().clone()), + Some(false) => BooleanChunked::full(self.name().clone(), false, rhs.len()), None => &self.new_from_index(0, rhs.len()) & rhs, }; }, (_, 1) => { return match rhs.get(0) { Some(true) => self.clone(), - Some(false) => BooleanChunked::full(self.name(), false, self.len()), + Some(false) => BooleanChunked::full(self.name().clone(), false, self.len()), None => self & &rhs.new_from_index(0, self.len()), }; }, @@ -195,8 +195,8 @@ mod test { #[test] fn guard_so_issue_2494() { // this cause a stack overflow - let a = BooleanChunked::new("a", [None]); - let b = BooleanChunked::new("b", [None]); + let a = BooleanChunked::new(PlSmallStr::from_static("a"), [None]); + let b = BooleanChunked::new(PlSmallStr::from_static("b"), [None]); assert_eq!((&a).bitand(&b).null_count(), 1); assert_eq!((&a).bitor(&b).null_count(), 1); diff --git a/crates/polars-core/src/chunked_array/builder/boolean.rs b/crates/polars-core/src/chunked_array/builder/boolean.rs index 031a45c8d74f..649db3d6252e 100644 --- a/crates/polars-core/src/chunked_array/builder/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/boolean.rs @@ -30,7 +30,7 @@ impl ChunkedBuilder for BooleanChunkedBuilder { } impl BooleanChunkedBuilder { - pub fn new(name: &str, capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize) -> Self { BooleanChunkedBuilder { array_builder: MutableBooleanArray::with_capacity(capacity), field: Field::new(name, DataType::Boolean), diff --git a/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs b/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs index e235d08ffbd6..64cccf3b7f36 100644 --- a/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs +++ 
b/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs @@ -1,13 +1,13 @@ use arrow::types::NativeType; +use polars_utils::pl_str::PlSmallStr; use polars_utils::unwrap::UnwrapUncheckedRelease; -use smartstring::alias::String as SmartString; use crate::prelude::*; pub(crate) struct FixedSizeListNumericBuilder { inner: Option>>, width: usize, - name: SmartString, + name: PlSmallStr, logical_dtype: DataType, } @@ -16,7 +16,7 @@ impl FixedSizeListNumericBuilder { /// /// The caller must ensure that the physical numerical type match logical type. pub(crate) unsafe fn new( - name: &str, + name: PlSmallStr, width: usize, capacity: usize, logical_dtype: DataType, @@ -26,7 +26,7 @@ impl FixedSizeListNumericBuilder { Self { inner, width, - name: name.into(), + name, logical_dtype, } } @@ -77,7 +77,7 @@ impl FixedSizeListBuilder for FixedSizeListNumericBuilder { // SAFETY: physical type matches the logical unsafe { ChunkedArray::from_chunks_and_dtype( - self.name.as_str(), + self.name.clone(), vec![Box::new(arr)], DataType::Array(Box::new(self.logical_dtype.clone()), self.width), ) @@ -87,13 +87,13 @@ impl FixedSizeListBuilder for FixedSizeListNumericBuilder { pub(crate) struct AnonymousOwnedFixedSizeListBuilder { inner: fixed_size_list::AnonymousBuilder, - name: SmartString, + name: PlSmallStr, inner_dtype: Option, } impl AnonymousOwnedFixedSizeListBuilder { pub(crate) fn new( - name: &str, + name: PlSmallStr, width: usize, capacity: usize, inner_dtype: Option, @@ -101,7 +101,7 @@ impl AnonymousOwnedFixedSizeListBuilder { let inner = fixed_size_list::AnonymousBuilder::new(capacity, width); Self { inner, - name: name.into(), + name, inner_dtype, } } @@ -128,7 +128,7 @@ impl FixedSizeListBuilder for AnonymousOwnedFixedSizeListBuilder { .as_ref(), ) .unwrap(); - ChunkedArray::with_chunk(self.name.as_str(), arr) + ChunkedArray::with_chunk(self.name.clone(), arr) } } @@ -136,7 +136,7 @@ pub(crate) fn get_fixed_size_list_builder( inner_type_logical: &DataType, capacity: 
usize, width: usize, - name: &str, + name: PlSmallStr, ) -> PolarsResult> { let phys_dtype = inner_type_logical.to_physical(); diff --git a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs index 38b6a3891148..ee74d5e6ec97 100644 --- a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs +++ b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs @@ -1,7 +1,7 @@ use super::*; pub struct AnonymousListBuilder<'a> { - name: String, + name: PlSmallStr, builder: AnonymousBuilder<'a>, fast_explode: bool, inner_dtype: DtypeMerger, @@ -9,14 +9,14 @@ pub struct AnonymousListBuilder<'a> { impl Default for AnonymousListBuilder<'_> { fn default() -> Self { - Self::new("", 0, None) + Self::new(PlSmallStr::const_default(), 0, None) } } impl<'a> AnonymousListBuilder<'a> { - pub fn new(name: &str, capacity: usize, inner_dtype: Option) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, inner_dtype: Option) -> Self { Self { - name: name.into(), + name, builder: AnonymousBuilder::new(capacity), fast_explode: true, inner_dtype: DtypeMerger::new(inner_dtype), @@ -74,7 +74,7 @@ impl<'a> AnonymousListBuilder<'a> { let slf = std::mem::take(self); if slf.builder.is_empty() { ListChunked::full_null_with_dtype( - &slf.name, + slf.name.clone(), 0, &slf.inner_dtype.materialize().unwrap_or(DataType::Null), ) @@ -91,18 +91,18 @@ impl<'a> AnonymousListBuilder<'a> { Some(dt) => DataType::List(Box::new(dt)), }; - let mut ca = ListChunked::with_chunk("", arr); + let mut ca = ListChunked::with_chunk(PlSmallStr::const_default(), arr); if slf.fast_explode { ca.set_fast_explode(); } - ca.field = Arc::new(Field::new(&slf.name, list_dtype_logical)); + ca.field = Arc::new(Field::new(slf.name.clone(), list_dtype_logical)); ca } } } pub struct AnonymousOwnedListBuilder { - name: String, + name: PlSmallStr, builder: AnonymousBuilder<'static>, owned: Vec, inner_dtype: DtypeMerger, @@ -111,7 +111,7 @@ pub 
struct AnonymousOwnedListBuilder { impl Default for AnonymousOwnedListBuilder { fn default() -> Self { - Self::new("", 0, None) + Self::new(PlSmallStr::const_default(), 0, None) } } @@ -151,19 +151,19 @@ impl ListBuilderTrait for AnonymousOwnedListBuilder { Some(dt) => DataType::List(Box::new(dt)), }; - let mut ca = ListChunked::with_chunk("", arr); + let mut ca = ListChunked::with_chunk(PlSmallStr::const_default(), arr); if slf.fast_explode { ca.set_fast_explode(); } - ca.field = Arc::new(Field::new(&slf.name, list_dtype_logical)); + ca.field = Arc::new(Field::new(slf.name.clone(), list_dtype_logical)); ca } } impl AnonymousOwnedListBuilder { - pub fn new(name: &str, capacity: usize, inner_dtype: Option) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, inner_dtype: Option) -> Self { Self { - name: name.into(), + name, builder: AnonymousBuilder::new(capacity), owned: Vec::with_capacity(capacity), inner_dtype: DtypeMerger::new(inner_dtype), diff --git a/crates/polars-core/src/chunked_array/builder/list/binary.rs b/crates/polars-core/src/chunked_array/builder/list/binary.rs index 6382d9269f49..d55a69a2eace 100644 --- a/crates/polars-core/src/chunked_array/builder/list/binary.rs +++ b/crates/polars-core/src/chunked_array/builder/list/binary.rs @@ -7,7 +7,7 @@ pub struct ListStringChunkedBuilder { } impl ListStringChunkedBuilder { - pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, values_capacity: usize) -> Self { let values = MutableBinaryViewArray::with_capacity(values_capacity); let builder = LargeListBinViewBuilder::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(DataType::String))); @@ -97,7 +97,7 @@ pub struct ListBinaryChunkedBuilder { } impl ListBinaryChunkedBuilder { - pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, values_capacity: usize) -> Self { let values = 
MutablePlBinary::with_capacity(values_capacity); let builder = LargeListBinViewBuilder::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(DataType::Binary))); diff --git a/crates/polars-core/src/chunked_array/builder/list/boolean.rs b/crates/polars-core/src/chunked_array/builder/list/boolean.rs index 1d83a05ace00..8142d1a50954 100644 --- a/crates/polars-core/src/chunked_array/builder/list/boolean.rs +++ b/crates/polars-core/src/chunked_array/builder/list/boolean.rs @@ -7,7 +7,7 @@ pub struct ListBooleanChunkedBuilder { } impl ListBooleanChunkedBuilder { - pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, values_capacity: usize) -> Self { let values = MutableBooleanArray::with_capacity(values_capacity); let builder = LargeListBooleanBuilder::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(DataType::Boolean))); diff --git a/crates/polars-core/src/chunked_array/builder/list/categorical.rs b/crates/polars-core/src/chunked_array/builder/list/categorical.rs index 748e8ea859b0..3670e0ab3df9 100644 --- a/crates/polars-core/src/chunked_array/builder/list/categorical.rs +++ b/crates/polars-core/src/chunked_array/builder/list/categorical.rs @@ -1,7 +1,7 @@ use super::*; pub fn create_categorical_chunked_listbuilder( - name: &str, + name: PlSmallStr, ordering: CategoricalOrdering, capacity: usize, values_capacity: usize, @@ -33,7 +33,7 @@ pub struct ListEnumCategoricalChunkedBuilder { impl ListEnumCategoricalChunkedBuilder { pub(super) fn new( - name: &str, + name: PlSmallStr, ordering: CategoricalOrdering, capacity: usize, values_capacity: usize, @@ -91,7 +91,7 @@ impl ListLocalCategoricalChunkedBuilder { } pub(super) fn new( - name: &str, + name: PlSmallStr, ordering: CategoricalOrdering, capacity: usize, values_capacity: usize, @@ -206,7 +206,7 @@ struct ListGlobalCategoricalChunkedBuilder { impl 
ListGlobalCategoricalChunkedBuilder { pub(super) fn new( - name: &str, + name: PlSmallStr, ordering: CategoricalOrdering, capacity: usize, values_capacity: usize, diff --git a/crates/polars-core/src/chunked_array/builder/list/mod.rs b/crates/polars-core/src/chunked_array/builder/list/mod.rs index 9a7f9243dcf3..645a2a168e90 100644 --- a/crates/polars-core/src/chunked_array/builder/list/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/list/mod.rs @@ -84,7 +84,7 @@ pub fn get_list_builder( inner_type_logical: &DataType, value_capacity: usize, list_capacity: usize, - name: &str, + name: PlSmallStr, ) -> PolarsResult> { match inner_type_logical { #[cfg(feature = "dtype-categorical")] @@ -159,21 +159,21 @@ pub fn get_list_builder( macro_rules! get_bool_builder { () => {{ let builder = - ListBooleanChunkedBuilder::new(&name, list_capacity, value_capacity); + ListBooleanChunkedBuilder::new(name, list_capacity, value_capacity); Box::new(builder) }}; } macro_rules! get_string_builder { () => {{ let builder = - ListStringChunkedBuilder::new(&name, list_capacity, 5 * value_capacity); + ListStringChunkedBuilder::new(name, list_capacity, 5 * value_capacity); Box::new(builder) }}; } macro_rules! 
get_binary_builder { () => {{ let builder = - ListBinaryChunkedBuilder::new(&name, list_capacity, 5 * value_capacity); + ListBinaryChunkedBuilder::new(name, list_capacity, 5 * value_capacity); Box::new(builder) }}; } diff --git a/crates/polars-core/src/chunked_array/builder/list/null.rs b/crates/polars-core/src/chunked_array/builder/list/null.rs index ab6e7a73ec7b..233f53e17412 100644 --- a/crates/polars-core/src/chunked_array/builder/list/null.rs +++ b/crates/polars-core/src/chunked_array/builder/list/null.rs @@ -2,14 +2,14 @@ use super::*; pub struct ListNullChunkedBuilder { builder: LargeListNullBuilder, - name: String, + name: PlSmallStr, } impl ListNullChunkedBuilder { - pub fn new(name: &str, capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize) -> Self { ListNullChunkedBuilder { builder: LargeListNullBuilder::with_capacity(capacity), - name: name.into(), + name, } } @@ -41,7 +41,7 @@ impl ListBuilderTrait for ListNullChunkedBuilder { fn finish(&mut self) -> ListChunked { unsafe { ListChunked::from_chunks_and_dtype_unchecked( - &self.name, + self.name.clone(), vec![self.builder.as_box()], DataType::List(Box::new(DataType::Null)), ) diff --git a/crates/polars-core/src/chunked_array/builder/list/primitive.rs b/crates/polars-core/src/chunked_array/builder/list/primitive.rs index d9555716d45d..0b1de987efb4 100644 --- a/crates/polars-core/src/chunked_array/builder/list/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/list/primitive.rs @@ -14,7 +14,7 @@ where T: PolarsNumericType, { pub fn new( - name: &str, + name: PlSmallStr, capacity: usize, values_capacity: usize, logical_type: DataType, @@ -31,7 +31,7 @@ where } pub fn new_with_values_type( - name: &str, + name: PlSmallStr, capacity: usize, values_capacity: usize, values_type: DataType, diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index bac88f5a1ea5..a2a36b469bb1 100644 --- 
a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -46,36 +46,36 @@ where let chunks = iter .into_iter() .map(|(values, opt_buffer)| to_primitive::(values, opt_buffer)); - ChunkedArray::from_chunk_iter("from_iter", chunks) + ChunkedArray::from_chunk_iter(PlSmallStr::const_default(), chunks) } } pub trait NewChunkedArray { - fn from_slice(name: &str, v: &[N]) -> Self; - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self; + fn from_slice(name: PlSmallStr, v: &[N]) -> Self; + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self; /// Create a new ChunkedArray from an iterator. - fn from_iter_options(name: &str, it: impl Iterator>) -> Self; + fn from_iter_options(name: PlSmallStr, it: impl Iterator>) -> Self; /// Create a new ChunkedArray from an iterator. - fn from_iter_values(name: &str, it: impl Iterator) -> Self; + fn from_iter_values(name: PlSmallStr, it: impl Iterator) -> Self; } impl NewChunkedArray for ChunkedArray where T: PolarsNumericType, { - fn from_slice(name: &str, v: &[T::Native]) -> Self { + fn from_slice(name: PlSmallStr, v: &[T::Native]) -> Self { let arr = PrimitiveArray::from_slice(v).to(T::get_dtype().to_arrow(CompatLevel::newest())); ChunkedArray::with_chunk(name, arr) } - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self { Self::from_iter_options(name, opt_v.iter().copied()) } fn from_iter_options( - name: &str, + name: PlSmallStr, it: impl Iterator>, ) -> ChunkedArray { let mut builder = PrimitiveChunkedBuilder::new(name, get_iter_capacity(&it)); @@ -84,7 +84,7 @@ where } /// Create a new ChunkedArray from an iterator. 
- fn from_iter_values(name: &str, it: impl Iterator) -> ChunkedArray { + fn from_iter_values(name: PlSmallStr, it: impl Iterator) -> ChunkedArray { let ca: NoNull> = it.collect(); let mut ca = ca.into_inner(); ca.rename(name); @@ -93,16 +93,16 @@ where } impl NewChunkedArray for BooleanChunked { - fn from_slice(name: &str, v: &[bool]) -> Self { + fn from_slice(name: PlSmallStr, v: &[bool]) -> Self { Self::from_iter_values(name, v.iter().copied()) } - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self { Self::from_iter_options(name, opt_v.iter().copied()) } fn from_iter_options( - name: &str, + name: PlSmallStr, it: impl Iterator>, ) -> ChunkedArray { let mut builder = BooleanChunkedBuilder::new(name, get_iter_capacity(&it)); @@ -111,7 +111,10 @@ impl NewChunkedArray for BooleanChunked { } /// Create a new ChunkedArray from an iterator. - fn from_iter_values(name: &str, it: impl Iterator) -> ChunkedArray { + fn from_iter_values( + name: PlSmallStr, + it: impl Iterator, + ) -> ChunkedArray { let mut ca: ChunkedArray<_> = it.collect(); ca.rename(name); ca @@ -122,23 +125,23 @@ impl NewChunkedArray for StringChunked where S: AsRef, { - fn from_slice(name: &str, v: &[S]) -> Self { + fn from_slice(name: PlSmallStr, v: &[S]) -> Self { let arr = Utf8ViewArray::from_slice_values(v); ChunkedArray::with_chunk(name, arr) } - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self { let arr = Utf8ViewArray::from_slice(opt_v); ChunkedArray::with_chunk(name, arr) } - fn from_iter_options(name: &str, it: impl Iterator>) -> Self { + fn from_iter_options(name: PlSmallStr, it: impl Iterator>) -> Self { let arr = MutableBinaryViewArray::from_iterator(it).freeze(); ChunkedArray::with_chunk(name, arr) } /// Create a new ChunkedArray from an iterator. 
- fn from_iter_values(name: &str, it: impl Iterator) -> Self { + fn from_iter_values(name: PlSmallStr, it: impl Iterator) -> Self { let arr = MutableBinaryViewArray::from_values_iter(it).freeze(); ChunkedArray::with_chunk(name, arr) } @@ -148,23 +151,23 @@ impl NewChunkedArray for BinaryChunked where B: AsRef<[u8]>, { - fn from_slice(name: &str, v: &[B]) -> Self { + fn from_slice(name: PlSmallStr, v: &[B]) -> Self { let arr = BinaryViewArray::from_slice_values(v); ChunkedArray::with_chunk(name, arr) } - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self { let arr = BinaryViewArray::from_slice(opt_v); ChunkedArray::with_chunk(name, arr) } - fn from_iter_options(name: &str, it: impl Iterator>) -> Self { + fn from_iter_options(name: PlSmallStr, it: impl Iterator>) -> Self { let arr = MutableBinaryViewArray::from_iterator(it).freeze(); ChunkedArray::with_chunk(name, arr) } /// Create a new ChunkedArray from an iterator. - fn from_iter_values(name: &str, it: impl Iterator) -> Self { + fn from_iter_values(name: PlSmallStr, it: impl Iterator) -> Self { let arr = MutableBinaryViewArray::from_values_iter(it).freeze(); ChunkedArray::with_chunk(name, arr) } @@ -176,7 +179,8 @@ mod test { #[test] fn test_primitive_builder() { - let mut builder = PrimitiveChunkedBuilder::::new("foo", 6); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::from_static("foo"), 6); let values = &[Some(1), None, Some(2), Some(3), None, Some(4)]; for val in values { builder.append_option(*val); @@ -187,12 +191,17 @@ mod test { #[test] fn test_list_builder() { - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 5, DataType::Int32); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 5, + DataType::Int32, + ); // Create a series containing two chunks. 
- let mut s1 = Int32Chunked::from_slice("a", &[1, 2, 3]).into_series(); - let s2 = Int32Chunked::from_slice("b", &[4, 5, 6]).into_series(); + let mut s1 = + Int32Chunked::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3]).into_series(); + let s2 = Int32Chunked::from_slice(PlSmallStr::from_static("b"), &[4, 5, 6]).into_series(); s1.append(&s2).unwrap(); builder.append_series(&s1).unwrap(); @@ -215,8 +224,12 @@ mod test { assert_eq!(out.get_as_series(0).unwrap().len(), 6); assert_eq!(out.get_as_series(1).unwrap().len(), 3); - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 5, DataType::Int32); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 5, + DataType::Int32, + ); builder.append_series(&s1).unwrap(); builder.append_null(); diff --git a/crates/polars-core/src/chunked_array/builder/null.rs b/crates/polars-core/src/chunked_array/builder/null.rs index 8e4d5b9cb107..f4101a2a14e7 100644 --- a/crates/polars-core/src/chunked_array/builder/null.rs +++ b/crates/polars-core/src/chunked_array/builder/null.rs @@ -10,7 +10,7 @@ pub struct NullChunkedBuilder { } impl NullChunkedBuilder { - pub fn new(name: &str, len: usize) -> Self { + pub fn new(name: PlSmallStr, len: usize) -> Self { let array_builder = MutableNullArray::new(len); NullChunkedBuilder { @@ -27,7 +27,7 @@ impl NullChunkedBuilder { pub fn finish(mut self) -> NullChunked { let arr = self.array_builder.as_box(); - let ca = NullChunked::new(Arc::from(self.field.name.as_str()), arr.len()); + let ca = NullChunked::new(self.field.name().clone(), arr.len()); ca } diff --git a/crates/polars-core/src/chunked_array/builder/primitive.rs b/crates/polars-core/src/chunked_array/builder/primitive.rs index 14eb2c1f4f46..f310d4145a19 100644 --- a/crates/polars-core/src/chunked_array/builder/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/primitive.rs @@ -39,7 +39,7 @@ impl PrimitiveChunkedBuilder where T: PolarsNumericType, { - pub fn new(name: &str, 
capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize) -> Self { let array_builder = MutablePrimitiveArray::::with_capacity(capacity) .to(T::get_dtype().to_arrow(CompatLevel::newest())); diff --git a/crates/polars-core/src/chunked_array/builder/string.rs b/crates/polars-core/src/chunked_array/builder/string.rs index 36c1d90492bc..8375760c606d 100644 --- a/crates/polars-core/src/chunked_array/builder/string.rs +++ b/crates/polars-core/src/chunked_array/builder/string.rs @@ -18,13 +18,12 @@ pub type StringChunkedBuilder = BinViewChunkedBuilder; pub type BinaryChunkedBuilder = BinViewChunkedBuilder<[u8]>; impl BinViewChunkedBuilder { - /// Create a new StringChunkedBuilder + /// Create a new BinViewChunkedBuilder /// /// # Arguments /// /// * `capacity` - Number of string elements in the final array. - /// * `bytes_capacity` - Number of bytes needed to store the string values. - pub fn new(name: &str, capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize) -> Self { Self { chunk_builder: MutableBinaryViewArray::with_capacity(capacity), field: Arc::new(Field::new(name, DataType::from(&T::DATA_TYPE))), diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index acbfb0839807..4b204f15fbf7 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -70,7 +70,7 @@ pub(crate) fn cast_chunks( } fn cast_impl_inner( - name: &str, + name: PlSmallStr, chunks: &[ArrayRef], dtype: &DataType, options: CastOptions, @@ -98,7 +98,7 @@ fn cast_impl_inner( } fn cast_impl( - name: &str, + name: PlSmallStr, chunks: &[ArrayRef], dtype: &DataType, options: CastOptions, @@ -108,7 +108,7 @@ fn cast_impl( #[cfg(feature = "dtype-struct")] fn cast_single_to_struct( - name: &str, + name: PlSmallStr, chunks: &[ArrayRef], fields: &[Field], options: CastOptions, @@ -117,12 +117,12 @@ fn cast_single_to_struct( // cast to first field dtype let mut fields = 
fields.iter(); let fld = fields.next().unwrap(); - let s = cast_impl_inner(&fld.name, chunks, &fld.dtype, options)?; + let s = cast_impl_inner(fld.name.clone(), chunks, &fld.dtype, options)?; let length = s.len(); new_fields.push(s); for fld in fields { - new_fields.push(Series::full_null(&fld.name, length, &fld.dtype)); + new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype)); } StructChunked::from_series(name, &new_fields).map(|ca| ca.into_series()) @@ -136,7 +136,11 @@ where if self.dtype() == data_type { // SAFETY: chunks are correct dtype let mut out = unsafe { - Series::from_chunks_and_dtype_unchecked(self.name(), self.chunks.clone(), data_type) + Series::from_chunks_and_dtype_unchecked( + self.name().clone(), + self.chunks.clone(), + data_type, + ) }; out.set_sorted_flag(self.is_sorted_flag()); return Ok(out); @@ -195,30 +199,32 @@ where }, #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => { - cast_single_to_struct(self.name(), &self.chunks, fields, options) + cast_single_to_struct(self.name().clone(), &self.chunks, fields, options) }, - _ => cast_impl_inner(self.name(), &self.chunks, data_type, options).map(|mut s| { - // maintain sorted if data types - // - remain signed - // - unsigned -> signed - // this may still fail with overflow? - let dtype = self.dtype(); - - let to_signed = data_type.is_signed_integer(); - let unsigned2unsigned = - dtype.is_unsigned_integer() && data_type.is_unsigned_integer(); - let allowed = to_signed || unsigned2unsigned; - - if (allowed) + _ => cast_impl_inner(self.name().clone(), &self.chunks, data_type, options).map( + |mut s| { + // maintain sorted if data types + // - remain signed + // - unsigned -> signed + // this may still fail with overflow? 
+ let dtype = self.dtype(); + + let to_signed = data_type.is_signed_integer(); + let unsigned2unsigned = + dtype.is_unsigned_integer() && data_type.is_unsigned_integer(); + let allowed = to_signed || unsigned2unsigned; + + if (allowed) && (s.null_count() == self.null_count()) // physical to logicals || (self.dtype().to_physical() == data_type.to_physical()) - { - let is_sorted = self.is_sorted_flag(); - s.set_sorted_flag(is_sorted) - } - s - }), + { + let is_sorted = self.is_sorted_flag(); + s.set_sorted_flag(is_sorted) + } + s + }, + ), } } } @@ -276,7 +282,7 @@ impl ChunkCast for StringChunked { let iter = unsafe { self.downcast_iter().flatten().trust_my_length(self.len()) }; let builder = - CategoricalChunkedBuilder::new(self.name(), self.len(), *ordering); + CategoricalChunkedBuilder::new(self.name().clone(), self.len(), *ordering); let ca = builder.drain_iter_and_finish(iter); Ok(ca.into_series()) }, @@ -292,13 +298,13 @@ impl ChunkCast for StringChunked { CategoricalChunked::from_string_to_enum(self, rev_map.get_categories(), *ordering) .map(|ca| { let mut s = ca.into_series(); - s.rename(self.name()); + s.rename(self.name().clone()); s }) }, #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => { - cast_single_to_struct(self.name(), &self.chunks, fields, options) + cast_single_to_struct(self.name().clone(), &self.chunks, fields, options) }, #[cfg(feature = "dtype-decimal")] DataType::Decimal(precision, scale) => match (precision, scale) { @@ -310,7 +316,7 @@ impl ChunkCast for StringChunked { *scale, ) }); - Ok(Int128Chunked::from_chunk_iter(self.name(), chunks) + Ok(Int128Chunked::from_chunk_iter(self.name().clone(), chunks) .into_decimal_unchecked(*precision, *scale) .into_series()) }, @@ -322,7 +328,7 @@ impl ChunkCast for StringChunked { #[cfg(feature = "dtype-date")] DataType::Date => { let result = cast_chunks(&self.chunks, data_type, options)?; - let out = Series::try_from((self.name(), result))?; + let out = 
Series::try_from((self.name().clone(), result))?; Ok(out) }, #[cfg(feature = "dtype-datetime")] @@ -336,7 +342,7 @@ impl ChunkCast for StringChunked { &Datetime(time_unit.to_owned(), Some(time_zone.clone())), options, )?; - Series::try_from((self.name(), result)) + Series::try_from((self.name().clone(), result)) }, _ => { let result = cast_chunks( @@ -344,12 +350,12 @@ impl ChunkCast for StringChunked { &Datetime(time_unit.to_owned(), None), options, )?; - Series::try_from((self.name(), result)) + Series::try_from((self.name().clone(), result)) }, }; out }, - _ => cast_impl(self.name(), &self.chunks, data_type, options), + _ => cast_impl(self.name().clone(), &self.chunks, data_type, options), } } @@ -366,7 +372,7 @@ impl BinaryChunked { .downcast_iter() .map(|arr| arr.to_utf8view_unchecked().boxed()) .collect(); - let field = Arc::new(Field::new(self.name(), DataType::String)); + let field = Arc::new(Field::new(self.name().clone(), DataType::String)); let mut ca = StringChunked::new_with_compute_len(field, chunks); @@ -383,7 +389,7 @@ impl StringChunked { .downcast_iter() .map(|arr| arr.to_binview().boxed()) .collect(); - let field = Arc::new(Field::new(self.name(), DataType::Binary)); + let field = Arc::new(Field::new(self.name().clone(), DataType::Binary)); let mut ca = BinaryChunked::new_with_compute_len(field, chunks); @@ -403,9 +409,9 @@ impl ChunkCast for BinaryChunked { match data_type { #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => { - cast_single_to_struct(self.name(), &self.chunks, fields, options) + cast_single_to_struct(self.name().clone(), &self.chunks, fields, options) }, - _ => cast_impl(self.name(), &self.chunks, data_type, options), + _ => cast_impl(self.name().clone(), &self.chunks, data_type, options), } } @@ -426,9 +432,9 @@ impl ChunkCast for BinaryOffsetChunked { match data_type { #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => { - cast_single_to_struct(self.name(), &self.chunks, fields, options) + 
cast_single_to_struct(self.name().clone(), &self.chunks, fields, options) }, - _ => cast_impl(self.name(), &self.chunks, data_type, options), + _ => cast_impl(self.name().clone(), &self.chunks, data_type, options), } } @@ -446,9 +452,9 @@ impl ChunkCast for BooleanChunked { match data_type { #[cfg(feature = "dtype-struct")] DataType::Struct(fields) => { - cast_single_to_struct(self.name(), &self.chunks, fields, options) + cast_single_to_struct(self.name().clone(), &self.chunks, fields, options) }, - _ => cast_impl(self.name(), &self.chunks, data_type, options), + _ => cast_impl(self.name().clone(), &self.chunks, data_type, options), } } @@ -483,7 +489,7 @@ impl ChunkCast for ListChunked { // we must take this path to correct for physical types. unsafe { Ok(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![arr], &List(Box::new(child_type)), )) @@ -505,7 +511,7 @@ impl ChunkCast for ListChunked { // we must take this path to correct for physical types. unsafe { Ok(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), chunks, &Array(child_type.clone(), *width), )) @@ -560,7 +566,7 @@ impl ChunkCast for ArrayChunked { // we must take this path to correct for physical types. unsafe { Ok(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![arr], &Array(Box::new(child_type), *width), )) @@ -576,7 +582,7 @@ impl ChunkCast for ArrayChunked { // we must take this path to correct for physical types. 
unsafe { Ok(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), chunks, &List(child_type.clone()), )) @@ -610,7 +616,11 @@ fn cast_list( let arr = ca.downcast_iter().next().unwrap(); // SAFETY: inner dtype is passed correctly let s = unsafe { - Series::from_chunks_and_dtype_unchecked("", vec![arr.values().clone()], ca.inner_dtype()) + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr.values().clone()], + ca.inner_dtype(), + ) }; let new_inner = s.cast_with_options(child_type, options)?; @@ -635,7 +645,11 @@ unsafe fn cast_list_unchecked(ca: &ListChunked, child_type: &DataType) -> Polars let arr = ca.downcast_iter().next().unwrap(); // SAFETY: inner dtype is passed correctly let s = unsafe { - Series::from_chunks_and_dtype_unchecked("", vec![arr.values().clone()], ca.inner_dtype()) + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr.values().clone()], + ca.inner_dtype(), + ) }; let new_inner = s.cast_unchecked(child_type)?; let new_values = new_inner.array_ref(0).clone(); @@ -648,7 +662,7 @@ unsafe fn cast_list_unchecked(ca: &ListChunked, child_type: &DataType) -> Polars arr.validity().cloned(), ); Ok(ListChunked::from_chunks_and_dtype_unchecked( - ca.name(), + ca.name().clone(), vec![Box::new(new_arr)], DataType::List(Box::new(child_type.clone())), ) @@ -667,7 +681,11 @@ fn cast_fixed_size_list( let arr = ca.downcast_iter().next().unwrap(); // SAFETY: inner dtype is passed correctly let s = unsafe { - Series::from_chunks_and_dtype_unchecked("", vec![arr.values().clone()], ca.inner_dtype()) + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr.values().clone()], + ca.inner_dtype(), + ) }; let new_inner = s.cast_with_options(child_type, options)?; @@ -689,8 +707,12 @@ mod test { #[test] fn test_cast_list() -> PolarsResult<()> { - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 10, DataType::Int32); + let mut builder = 
ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 10, + DataType::Int32, + ); builder.append_opt_slice(Some(&[1i32, 2, 3])); builder.append_opt_slice(Some(&[1i32, 2, 3])); let ca = builder.finish(); @@ -708,7 +730,7 @@ mod test { #[cfg(feature = "dtype-categorical")] fn test_cast_noop() { // check if we can cast categorical twice without panic - let ca = StringChunked::new("foo", &["bar", "ham"]); + let ca = StringChunked::new(PlSmallStr::from_static("foo"), &["bar", "ham"]); let out = ca .cast_with_options( &DataType::Categorical(None, Default::default()), diff --git a/crates/polars-core/src/chunked_array/collect.rs b/crates/polars-core/src/chunked_array/collect.rs index 054f59de8958..eb882b1acc13 100644 --- a/crates/polars-core/src/chunked_array/collect.rs +++ b/crates/polars-core/src/chunked_array/collect.rs @@ -13,6 +13,7 @@ use std::sync::Arc; use arrow::trusted_len::TrustedLen; +use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::ChunkedArray; use crate::datatypes::{ @@ -22,7 +23,7 @@ use crate::prelude::CompatLevel; pub trait ChunkedCollectIterExt: Iterator + Sized { #[inline] - fn collect_ca_with_dtype(self, name: &str, dtype: DataType) -> ChunkedArray + fn collect_ca_with_dtype(self, name: PlSmallStr, dtype: DataType) -> ChunkedArray where T::Array: ArrayFromIterDtype, { @@ -42,7 +43,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { } #[inline] - fn collect_ca_trusted_with_dtype(self, name: &str, dtype: DataType) -> ChunkedArray + fn collect_ca_trusted_with_dtype(self, name: PlSmallStr, dtype: DataType) -> ChunkedArray where T::Array: ArrayFromIterDtype, Self: TrustedLen, @@ -66,7 +67,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { #[inline] fn try_collect_ca_with_dtype( self, - name: &str, + name: PlSmallStr, dtype: DataType, ) -> Result, E> where @@ -95,7 +96,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { #[inline] fn try_collect_ca_trusted_with_dtype( self, - name: &str, + name: PlSmallStr, 
dtype: DataType, ) -> Result, E> where @@ -128,7 +129,7 @@ impl ChunkedCollectIterExt for I {} pub trait ChunkedCollectInferIterExt: Iterator + Sized { #[inline] - fn collect_ca(self, name: &str) -> ChunkedArray + fn collect_ca(self, name: PlSmallStr) -> ChunkedArray where T::Array: ArrayFromIter, { @@ -138,7 +139,7 @@ pub trait ChunkedCollectInferIterExt: Iterator + Sized { } #[inline] - fn collect_ca_trusted(self, name: &str) -> ChunkedArray + fn collect_ca_trusted(self, name: PlSmallStr) -> ChunkedArray where T::Array: ArrayFromIter, Self: TrustedLen, @@ -149,7 +150,7 @@ pub trait ChunkedCollectInferIterExt: Iterator + Sized { } #[inline] - fn try_collect_ca(self, name: &str) -> Result, E> + fn try_collect_ca(self, name: PlSmallStr) -> Result, E> where T::Array: ArrayFromIter, Self: Iterator>, @@ -160,7 +161,7 @@ pub trait ChunkedCollectInferIterExt: Iterator + Sized { } #[inline] - fn try_collect_ca_trusted(self, name: &str) -> Result, E> + fn try_collect_ca_trusted(self, name: PlSmallStr) -> Result, E> where T::Array: ArrayFromIter, Self: Iterator> + TrustedLen, diff --git a/crates/polars-core/src/chunked_array/comparison/categorical.rs b/crates/polars-core/src/chunked_array/comparison/categorical.rs index 77ddf45a5a69..faa7f619cdb2 100644 --- a/crates/polars-core/src/chunked_array/comparison/categorical.rs +++ b/crates/polars-core/src/chunked_array/comparison/categorical.rs @@ -57,13 +57,13 @@ where .map(|phys| rev_map_r.get_unchecked(phys)) }; let Some(v) = v else { - return Ok(BooleanChunked::full_null(lhs.name(), lhs_len)); + return Ok(BooleanChunked::full_null(lhs.name().clone(), lhs_len)); }; Ok(lhs .iter_str() .map(|opt_s| opt_s.map(|s| compare_str_function(s, v))) - .collect_ca_trusted(lhs.name())) + .collect_ca_trusted(lhs.name().clone())) }, (1, rhs_len) => { // SAFETY: physical is in range of revmap @@ -73,12 +73,12 @@ where .map(|phys| rev_map_l.get_unchecked(phys)) }; let Some(v) = v else { - return Ok(BooleanChunked::full_null(lhs.name(), 
rhs_len)); + return Ok(BooleanChunked::full_null(lhs.name().clone(), rhs_len)); }; Ok(rhs .iter_str() .map(|opt_s| opt_s.map(|s| compare_str_function(v, s))) - .collect_ca_trusted(lhs.name())) + .collect_ca_trusted(lhs.name().clone())) }, (lhs_len, rhs_len) if lhs_len == rhs_len => Ok(lhs .iter_str() @@ -88,7 +88,7 @@ where (_, None) => None, (Some(l), Some(r)) => Some(compare_str_function(l, r)), }) - .collect_ca_trusted(lhs.name())), + .collect_ca_trusted(lhs.name().clone())), (lhs_len, rhs_len) => { polars_bail!(ComputeError: "Columns are of unequal length: {} vs {}",lhs_len,rhs_len) }, @@ -103,7 +103,7 @@ impl ChunkCompare<&CategoricalChunked> for CategoricalChunked { cat_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, false), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, false), UInt32Chunked::equal, ) } @@ -112,7 +112,7 @@ impl ChunkCompare<&CategoricalChunked> for CategoricalChunked { cat_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), false, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), false, lhs.len()), UInt32Chunked::equal_missing, ) } @@ -121,7 +121,7 @@ impl ChunkCompare<&CategoricalChunked> for CategoricalChunked { cat_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, true), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, true), UInt32Chunked::not_equal, ) } @@ -130,7 +130,7 @@ impl ChunkCompare<&CategoricalChunked> for CategoricalChunked { cat_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), true, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), true, lhs.len()), UInt32Chunked::not_equal_missing, ) } @@ -203,7 +203,7 @@ where cat_compare_function(lhs, rhs_cat.categorical().unwrap()) } else if rhs.len() == 1 { match rhs.get(0) { - None => Ok(BooleanChunked::full_null(lhs.name(), lhs.len())), + None => Ok(BooleanChunked::full_null(lhs.name().clone(), lhs.len())), 
Some(s) => cat_single_str_compare_helper( lhs, s, @@ -224,8 +224,8 @@ impl ChunkCompare<&StringChunked> for CategoricalChunked { cat_str_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, false), - |lhs| BooleanChunked::full_null(lhs.name(), lhs.len()), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, false), + |lhs| BooleanChunked::full_null(lhs.name().clone(), lhs.len()), |s1, s2| CategoricalChunked::equal(s1, s2), UInt32Chunked::equal, StringChunked::equal, @@ -235,7 +235,7 @@ impl ChunkCompare<&StringChunked> for CategoricalChunked { cat_str_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), false, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), false, lhs.len()), |lhs| lhs.physical().is_null(), |s1, s2| CategoricalChunked::equal_missing(s1, s2), UInt32Chunked::equal_missing, @@ -247,8 +247,8 @@ impl ChunkCompare<&StringChunked> for CategoricalChunked { cat_str_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, true), - |lhs| BooleanChunked::full_null(lhs.name(), lhs.len()), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, true), + |lhs| BooleanChunked::full_null(lhs.name().clone(), lhs.len()), |s1, s2| CategoricalChunked::not_equal(s1, s2), UInt32Chunked::not_equal, StringChunked::not_equal, @@ -258,7 +258,7 @@ impl ChunkCompare<&StringChunked> for CategoricalChunked { cat_str_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), true, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), true, lhs.len()), |lhs| !lhs.physical().is_null(), |s1, s2| CategoricalChunked::not_equal_missing(s1, s2), UInt32Chunked::not_equal_missing, @@ -371,7 +371,7 @@ where // SAFETY: indexing into bitmap with same length as original array opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) }) })) - .with_name(lhs.name()), + .with_name(lhs.name().clone()), ) } } @@ -383,7 +383,7 @@ impl 
ChunkCompare<&str> for CategoricalChunked { cat_single_str_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, false), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, false), UInt32Chunked::equal, ) } @@ -392,7 +392,7 @@ impl ChunkCompare<&str> for CategoricalChunked { cat_single_str_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), false, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), false, lhs.len()), UInt32Chunked::equal_missing, ) } @@ -401,7 +401,7 @@ impl ChunkCompare<&str> for CategoricalChunked { cat_single_str_equality_helper( self, rhs, - |lhs| replace_non_null(lhs.name(), &lhs.physical().chunks, true), + |lhs| replace_non_null(lhs.name().clone(), &lhs.physical().chunks, true), UInt32Chunked::not_equal, ) } @@ -410,7 +410,7 @@ impl ChunkCompare<&str> for CategoricalChunked { cat_single_str_equality_helper( self, rhs, - |lhs| BooleanChunked::full(lhs.name(), true, lhs.len()), + |lhs| BooleanChunked::full(lhs.name().clone(), true, lhs.len()), UInt32Chunked::equal_missing, ) } diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 4049992340d1..17500b22131b 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -30,17 +30,22 @@ where if let Some(value) = rhs.get(0) { self.equal(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.equal(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_eq_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_eq_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -65,7 +70,7 @@ 
where self, rhs, |a, b| a.tot_eq_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -77,17 +82,22 @@ where if let Some(value) = rhs.get(0) { self.not_equal(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.not_equal(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_ne_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_ne_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -112,7 +122,7 @@ where self, rhs, |a, b| a.tot_ne_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -124,17 +134,22 @@ where if let Some(value) = rhs.get(0) { self.lt(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.gt(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_lt_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_lt_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -145,17 +160,22 @@ where if let Some(value) = rhs.get(0) { self.lt_eq(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.gt_eq(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_le_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_le_kernel(b).into(), + 
PlSmallStr::const_default(), + ), } } @@ -172,35 +192,35 @@ impl ChunkCompare<&NullChunked> for NullChunked { type Item = BooleanChunked; fn equal(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } fn equal_missing(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full(self.name(), true, get_broadcast_length(self, rhs)) + BooleanChunked::full(self.name().clone(), true, get_broadcast_length(self, rhs)) } fn not_equal(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } fn not_equal_missing(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full(self.name(), false, get_broadcast_length(self, rhs)) + BooleanChunked::full(self.name().clone(), false, get_broadcast_length(self, rhs)) } fn gt(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } fn gt_eq(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } fn lt(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } fn lt_eq(&self, rhs: &NullChunked) -> Self::Item { - BooleanChunked::full_null(self.name(), get_broadcast_length(self, rhs)) + BooleanChunked::full_null(self.name().clone(), get_broadcast_length(self, rhs)) } } @@ -224,17 +244,22 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { if let Some(value) = rhs.get(0) { arity::unary_mut_values(self, |arr| 
arr.tot_eq_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { arity::unary_mut_values(rhs, |arr| arr.tot_eq_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_eq_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_eq_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -263,7 +288,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { self, rhs, |a, b| a.tot_eq_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -275,17 +300,22 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { if let Some(value) = rhs.get(0) { arity::unary_mut_values(self, |arr| arr.tot_ne_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { arity::unary_mut_values(rhs, |arr| arr.tot_ne_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_ne_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_ne_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -314,7 +344,7 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { self, rhs, |a, b| a.tot_ne_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -326,17 +356,22 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { if let Some(value) = rhs.get(0) { arity::unary_mut_values(self, |arr| arr.tot_lt_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", self.len()) + 
BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { arity::unary_mut_values(rhs, |arr| arr.tot_gt_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_lt_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_lt_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -347,17 +382,22 @@ impl ChunkCompare<&BooleanChunked> for BooleanChunked { if let Some(value) = rhs.get(0) { arity::unary_mut_values(self, |arr| arr.tot_le_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { arity::unary_mut_values(rhs, |arr| arr.tot_ge_kernel_broadcast(&value).into()) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_le_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_le_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -415,17 +455,22 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { if let Some(value) = rhs.get(0) { self.equal(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.equal(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_eq_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_eq_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -450,7 +495,7 @@ impl 
ChunkCompare<&BinaryChunked> for BinaryChunked { self, rhs, |a, b| a.tot_eq_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -462,17 +507,22 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { if let Some(value) = rhs.get(0) { self.not_equal(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.not_equal(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_ne_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_ne_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -497,7 +547,7 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { self, rhs, |a, b| a.tot_ne_missing_kernel(b).into(), - "", + PlSmallStr::const_default(), ), } } @@ -509,17 +559,22 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { if let Some(value) = rhs.get(0) { self.lt(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.gt(value) } else { - BooleanChunked::full_null("", rhs.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_lt_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_lt_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -530,17 +585,22 @@ impl ChunkCompare<&BinaryChunked> for BinaryChunked { if let Some(value) = rhs.get(0) { self.lt_eq(value) } else { - BooleanChunked::full_null("", self.len()) + BooleanChunked::full_null(PlSmallStr::const_default(), self.len()) } }, (1, _) => { if let Some(value) = self.get(0) { rhs.gt_eq(value) } else { - BooleanChunked::full_null("", rhs.len()) + 
BooleanChunked::full_null(PlSmallStr::const_default(), rhs.len()) } }, - _ => arity::binary_mut_values(self, rhs, |a, b| a.tot_le_kernel(b).into(), ""), + _ => arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_le_kernel(b).into(), + PlSmallStr::const_default(), + ), } } @@ -560,13 +620,17 @@ where { match (lhs.len(), rhs.len()) { (_, 1) => { - let right = rhs.get_as_series(0).map(|s| s.with_name("")); + let right = rhs + .get_as_series(0) + .map(|s| s.with_name(PlSmallStr::const_default())); lhs.amortized_iter() .map(|left| op(left.as_ref().map(|us| us.as_ref()), right.as_ref())) .collect_trusted() }, (1, _) => { - let left = lhs.get_as_series(0).map(|s| s.with_name("")); + let left = lhs + .get_as_series(0) + .map(|s| s.with_name(PlSmallStr::const_default())); rhs.amortized_iter() .map(|right| op(left.as_ref(), right.as_ref().map(|us| us.as_ref()))) .collect_trusted() @@ -657,7 +721,7 @@ where { if a.len() != b.len() || a.struct_fields().len() != b.struct_fields().len() { // polars_ensure!(a.len() == 1 || b.len() == 1, ShapeMismatch: "length lhs: {}, length rhs: {}", a.len(), b.len()); - BooleanChunked::full("", value, a.len()) + BooleanChunked::full(PlSmallStr::const_default(), value, a.len()) } else { let (a, b) = align_chunks_binary(a, b); let mut out = a @@ -729,30 +793,50 @@ impl ChunkCompare<&ArrayChunked> for ArrayChunked { type Item = BooleanChunked; fn equal(&self, rhs: &ArrayChunked) -> BooleanChunked { if self.width() != rhs.width() { - return BooleanChunked::full("", false, self.len()); + return BooleanChunked::full(PlSmallStr::const_default(), false, self.len()); } - arity::binary_mut_values(self, rhs, |a, b| a.tot_eq_kernel(b).into(), "") + arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_eq_kernel(b).into(), + PlSmallStr::const_default(), + ) } fn equal_missing(&self, rhs: &ArrayChunked) -> BooleanChunked { if self.width() != rhs.width() { - return BooleanChunked::full("", false, self.len()); + return 
BooleanChunked::full(PlSmallStr::const_default(), false, self.len()); } - arity::binary_mut_with_options(self, rhs, |a, b| a.tot_eq_missing_kernel(b).into(), "") + arity::binary_mut_with_options( + self, + rhs, + |a, b| a.tot_eq_missing_kernel(b).into(), + PlSmallStr::const_default(), + ) } fn not_equal(&self, rhs: &ArrayChunked) -> BooleanChunked { if self.width() != rhs.width() { - return BooleanChunked::full("", true, self.len()); + return BooleanChunked::full(PlSmallStr::const_default(), true, self.len()); } - arity::binary_mut_values(self, rhs, |a, b| a.tot_ne_kernel(b).into(), "") + arity::binary_mut_values( + self, + rhs, + |a, b| a.tot_ne_kernel(b).into(), + PlSmallStr::const_default(), + ) } fn not_equal_missing(&self, rhs: &ArrayChunked) -> Self::Item { if self.width() != rhs.width() { - return BooleanChunked::full("", true, self.len()); + return BooleanChunked::full(PlSmallStr::const_default(), true, self.len()); } - arity::binary_mut_with_options(self, rhs, |a, b| a.tot_ne_missing_kernel(b).into(), "") + arity::binary_mut_with_options( + self, + rhs, + |a, b| a.tot_ne_missing_kernel(b).into(), + PlSmallStr::const_default(), + ) } // following are not implemented because gt, lt comparison of series don't make sense @@ -778,7 +862,7 @@ impl Not for &BooleanChunked { fn not(self) -> Self::Output { let chunks = self.downcast_iter().map(compute::boolean::not); - ChunkedArray::from_chunk_iter(self.name(), chunks) + ChunkedArray::from_chunk_iter(self.name().clone(), chunks) } } @@ -913,17 +997,20 @@ mod test { use crate::prelude::*; pub(crate) fn create_two_chunked() -> (Int32Chunked, Int32Chunked) { - let mut a1 = Int32Chunked::new("a", &[1, 2, 3]); - let a2 = Int32Chunked::new("a", &[4, 5, 6]); - let a3 = Int32Chunked::new("a", &[1, 2, 3, 4, 5, 6]); + let mut a1 = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let a2 = Int32Chunked::new(PlSmallStr::from_static("a"), &[4, 5, 6]); + let a3 = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 
2, 3, 4, 5, 6]); a1.append(&a2).unwrap(); (a1, a3) } #[test] fn test_bitwise_ops() { - let a = BooleanChunked::new("a", &[true, false, false]); - let b = BooleanChunked::new("b", &[Some(true), Some(true), None]); + let a = BooleanChunked::new(PlSmallStr::from_static("a"), &[true, false, false]); + let b = BooleanChunked::new( + PlSmallStr::from_static("b"), + &[Some(true), Some(true), None], + ); assert_eq!(Vec::from(&a | &b), &[Some(true), Some(true), None]); assert_eq!(Vec::from(&a & &b), &[Some(true), Some(false), Some(false)]); assert_eq!(Vec::from(!b), &[Some(false), Some(false), None]); @@ -1131,9 +1218,13 @@ mod test { #[test] fn test_kleene() { - let a = BooleanChunked::new("", &[Some(true), Some(false), None]); - let trues = BooleanChunked::from_slice("", &[true, true, true]); - let falses = BooleanChunked::from_slice("", &[false, false, false]); + let a = BooleanChunked::new( + PlSmallStr::const_default(), + &[Some(true), Some(false), None], + ); + let trues = BooleanChunked::from_slice(PlSmallStr::const_default(), &[true, true, true]); + let falses = + BooleanChunked::from_slice(PlSmallStr::const_default(), &[false, false, false]); let c = &a | &trues; assert_eq!(Vec::from(&c), &[Some(true), Some(true), Some(true)]); @@ -1144,9 +1235,9 @@ mod test { #[test] fn list_broadcasting_lists() { - let s_el = Series::new("", &[1, 2, 3]); - let s_lhs = Series::new("", &[s_el.clone(), s_el.clone()]); - let s_rhs = Series::new("", &[s_el.clone()]); + let s_el = Series::new(PlSmallStr::const_default(), &[1, 2, 3]); + let s_lhs = Series::new(PlSmallStr::const_default(), &[s_el.clone(), s_el.clone()]); + let s_rhs = Series::new(PlSmallStr::const_default(), &[s_el.clone()]); let result = s_lhs.list().unwrap().equal(s_rhs.list().unwrap()); assert_eq!(result.len(), 2); @@ -1155,9 +1246,9 @@ mod test { #[test] fn test_broadcasting_bools() { - let a = BooleanChunked::from_slice("", &[true, false, true]); - let true_ = BooleanChunked::from_slice("", &[true]); - let false_ = 
BooleanChunked::from_slice("", &[false]); + let a = BooleanChunked::from_slice(PlSmallStr::const_default(), &[true, false, true]); + let true_ = BooleanChunked::from_slice(PlSmallStr::const_default(), &[true]); + let false_ = BooleanChunked::from_slice(PlSmallStr::const_default(), &[false]); let out = a.equal(&true_); assert_eq!(Vec::from(&out), &[Some(true), Some(false), Some(true)]); @@ -1213,9 +1304,13 @@ mod test { let out = false_.lt_eq(&a); assert_eq!(Vec::from(&out), &[Some(true), Some(true), Some(true)]); - let a = BooleanChunked::from_slice_options("", &[Some(true), Some(false), None]); - let all_true = BooleanChunked::from_slice("", &[true, true, true]); - let all_false = BooleanChunked::from_slice("", &[false, false, false]); + let a = BooleanChunked::from_slice_options( + PlSmallStr::const_default(), + &[Some(true), Some(false), None], + ); + let all_true = BooleanChunked::from_slice(PlSmallStr::const_default(), &[true, true, true]); + let all_false = + BooleanChunked::from_slice(PlSmallStr::const_default(), &[false, false, false]); let out = a.equal(&true_); assert_eq!(Vec::from(&out), &[Some(true), Some(false), None]); let out = a.not_equal(&true_); @@ -1237,9 +1332,9 @@ mod test { #[test] fn test_broadcasting_numeric() { - let a = Int32Chunked::from_slice("", &[1, 2, 3]); - let one = Int32Chunked::from_slice("", &[1]); - let three = Int32Chunked::from_slice("", &[3]); + let a = Int32Chunked::from_slice(PlSmallStr::const_default(), &[1, 2, 3]); + let one = Int32Chunked::from_slice(PlSmallStr::const_default(), &[1]); + let three = Int32Chunked::from_slice(PlSmallStr::const_default(), &[3]); let out = a.equal(&one); assert_eq!(Vec::from(&out), &[Some(true), Some(false), Some(false)]); diff --git a/crates/polars-core/src/chunked_array/comparison/scalar.rs b/crates/polars-core/src/chunked_array/comparison/scalar.rs index f47f23780f82..92f39c924b22 100644 --- a/crates/polars-core/src/chunked_array/comparison/scalar.rs +++ 
b/crates/polars-core/src/chunked_array/comparison/scalar.rs @@ -56,7 +56,7 @@ where BooleanArray::from_data_default(mask.into(), None) }); - let mut ca = BooleanChunked::from_chunk_iter(ca.name(), chunks); + let mut ca = BooleanChunked::from_chunk_iter(ca.name().clone(), chunks); ca.set_sorted_flag(output_order.unwrap_or(IsSorted::Ascending)); ca } @@ -235,7 +235,7 @@ mod test { #[test] fn test_binary_search_cmp() { - let mut s = Series::new("", &[1, 1, 2, 2, 4, 8]); + let mut s = Series::new(PlSmallStr::const_default(), &[1, 1, 2, 2, 4, 8]); s.set_sorted_flag(IsSorted::Ascending); let out = s.gt(10).unwrap(); assert!(!out.any()); @@ -246,12 +246,18 @@ mod test { let out = s.gt(2).unwrap(); assert_eq!( out.into_series(), - Series::new("", [false, false, false, false, true, true]) + Series::new( + PlSmallStr::const_default(), + [false, false, false, false, true, true] + ) ); let out = s.gt(3).unwrap(); assert_eq!( out.into_series(), - Series::new("", [false, false, false, false, true, true]) + Series::new( + PlSmallStr::const_default(), + [false, false, false, false, true, true] + ) ); let out = s.gt_eq(10).unwrap(); @@ -262,12 +268,18 @@ mod test { let out = s.gt_eq(2).unwrap(); assert_eq!( out.into_series(), - Series::new("", [false, false, true, true, true, true]) + Series::new( + PlSmallStr::const_default(), + [false, false, true, true, true, true] + ) ); let out = s.gt_eq(3).unwrap(); assert_eq!( out.into_series(), - Series::new("", [false, false, false, false, true, true]) + Series::new( + PlSmallStr::const_default(), + [false, false, false, false, true, true] + ) ); let out = s.lt(10).unwrap(); @@ -278,12 +290,18 @@ mod test { let out = s.lt(2).unwrap(); assert_eq!( out.into_series(), - Series::new("", [true, true, false, false, false, false]) + Series::new( + PlSmallStr::const_default(), + [true, true, false, false, false, false] + ) ); let out = s.lt(3).unwrap(); assert_eq!( out.into_series(), - Series::new("", [true, true, true, true, false, false]) + 
Series::new( + PlSmallStr::const_default(), + [true, true, true, true, false, false] + ) ); let out = s.lt_eq(10).unwrap(); @@ -294,12 +312,18 @@ mod test { let out = s.lt_eq(2).unwrap(); assert_eq!( out.into_series(), - Series::new("", [true, true, true, true, false, false]) + Series::new( + PlSmallStr::const_default(), + [true, true, true, true, false, false] + ) ); let out = s.lt(3).unwrap(); assert_eq!( out.into_series(), - Series::new("", [true, true, true, true, false, false]) + Series::new( + PlSmallStr::const_default(), + [true, true, true, true, false, false] + ) ); } } diff --git a/crates/polars-core/src/chunked_array/float.rs b/crates/polars-core/src/chunked_array/float.rs index dc09024b704c..8376629cc403 100644 --- a/crates/polars-core/src/chunked_array/float.rs +++ b/crates/polars-core/src/chunked_array/float.rs @@ -30,7 +30,7 @@ where let chunks = self .downcast_iter() .map(|arr| set_at_nulls(arr, T::Native::nan())); - ChunkedArray::from_chunk_iter(self.name(), chunks) + ChunkedArray::from_chunk_iter(self.name().clone(), chunks) } } diff --git a/crates/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs index 74f12ccc58ce..999fea5ba7af 100644 --- a/crates/polars-core/src/chunked_array/from.rs +++ b/crates/polars-core/src/chunked_array/from.rs @@ -27,7 +27,7 @@ fn from_chunks_list_dtype(chunks: &mut Vec, dtype: DataType) -> DataTy let values_arr = list_arr.values(); let cat = unsafe { Series::_try_from_arrow_unchecked( - "", + PlSmallStr::const_default(), vec![values_arr.clone()], values_arr.data_type(), ) @@ -59,7 +59,7 @@ fn from_chunks_list_dtype(chunks: &mut Vec, dtype: DataType) -> DataTy let values_arr = list_arr.values(); let cat = unsafe { Series::_try_from_arrow_unchecked( - "", + PlSmallStr::const_default(), vec![values_arr.clone()], values_arr.data_type(), ) @@ -88,7 +88,7 @@ where A: Array, { fn from(arr: A) -> Self { - Self::with_chunk("", arr) + Self::with_chunk(PlSmallStr::const_default(), arr) } } @@ 
-96,7 +96,7 @@ impl ChunkedArray where T: PolarsDataType, { - pub fn with_chunk(name: &str, arr: A) -> Self + pub fn with_chunk(name: PlSmallStr, arr: A) -> Self where A: Array, T: PolarsDataType, @@ -112,7 +112,7 @@ where Self::from_chunk_iter_like(ca, std::iter::once(arr)) } - pub fn from_chunk_iter(name: &str, iter: I) -> Self + pub fn from_chunk_iter(name: PlSmallStr, iter: I) -> Self where I: IntoIterator, T: PolarsDataType::Item>, @@ -135,10 +135,12 @@ where .into_iter() .map(|x| Box::new(x) as Box) .collect(); - unsafe { Self::from_chunks_and_dtype_unchecked(ca.name(), chunks, ca.dtype().clone()) } + unsafe { + Self::from_chunks_and_dtype_unchecked(ca.name().clone(), chunks, ca.dtype().clone()) + } } - pub fn try_from_chunk_iter(name: &str, iter: I) -> Result + pub fn try_from_chunk_iter(name: PlSmallStr, iter: I) -> Result where I: IntoIterator>, T: PolarsDataType, @@ -187,7 +189,7 @@ where /// /// # Safety /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`. - pub unsafe fn from_chunks(name: &str, mut chunks: Vec) -> Self { + pub unsafe fn from_chunks(name: PlSmallStr, mut chunks: Vec) -> Self { let dtype = match T::get_dtype() { dtype @ DataType::List(_) => from_chunks_list_dtype(&mut chunks, dtype), #[cfg(feature = "dtype-array")] @@ -210,7 +212,7 @@ where /// # Safety /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`. pub unsafe fn from_chunks_and_dtype( - name: &str, + name: PlSmallStr, chunks: Vec, dtype: DataType, ) -> Self { @@ -230,7 +232,7 @@ where } pub(crate) unsafe fn from_chunks_and_dtype_unchecked( - name: &str, + name: PlSmallStr, chunks: Vec, dtype: DataType, ) -> Self { @@ -252,12 +254,16 @@ where T: PolarsNumericType, { /// Create a new ChunkedArray by taking ownership of the Vec. This operation is zero copy. 
- pub fn from_vec(name: &str, v: Vec) -> Self { + pub fn from_vec(name: PlSmallStr, v: Vec) -> Self { Self::with_chunk(name, to_primitive::(v, None)) } /// Create a new ChunkedArray from a Vec and a validity mask. - pub fn from_vec_validity(name: &str, values: Vec, buffer: Option) -> Self { + pub fn from_vec_validity( + name: PlSmallStr, + values: Vec, + buffer: Option, + ) -> Self { let arr = to_array::(values, buffer); ChunkedArray::new_with_compute_len(Arc::new(Field::new(name, T::get_dtype())), vec![arr]) } @@ -267,7 +273,7 @@ where /// # Safety /// The lifetime will be bound to the lifetime of the slice. /// This will not be checked by the borrowchecker. - pub unsafe fn mmap_slice(name: &str, values: &[T::Native]) -> Self { + pub unsafe fn mmap_slice(name: PlSmallStr, values: &[T::Native]) -> Self { Self::with_chunk(name, arrow::ffi::mmap::slice(values)) } } @@ -278,7 +284,7 @@ impl BooleanChunked { /// # Safety /// The lifetime will be bound to the lifetime of the slice. /// This will not be checked by the borrowchecker. - pub unsafe fn mmap_slice(name: &str, values: &[u8], offset: usize, len: usize) -> Self { + pub unsafe fn mmap_slice(name: PlSmallStr, values: &[u8], offset: usize, len: usize) -> Self { let arr = arrow::ffi::mmap::bitmap(values, offset, len).unwrap(); Self::with_chunk(name, arr) } diff --git a/crates/polars-core/src/chunked_array/from_iterator.rs b/crates/polars-core/src/chunked_array/from_iterator.rs index 8d2499983d73..0035c1b289f7 100644 --- a/crates/polars-core/src/chunked_array/from_iterator.rs +++ b/crates/polars-core/src/chunked_array/from_iterator.rs @@ -20,7 +20,7 @@ where #[inline] fn from_iter>>(iter: I) -> Self { // TODO: eliminate this FromIterator implementation entirely. - iter.into_iter().collect_ca("") + iter.into_iter().collect_ca(PlSmallStr::const_default()) } } @@ -35,7 +35,7 @@ where fn from_iter>(iter: I) -> Self { // 2021-02-07: aligned vec was ~2x faster than arrow collect. 
let av = iter.into_iter().collect::>(); - NoNull::new(ChunkedArray::from_vec("", av)) + NoNull::new(ChunkedArray::from_vec(PlSmallStr::const_default(), av)) } } @@ -49,14 +49,14 @@ impl FromIterator> for ChunkedArray { impl FromIterator for BooleanChunked { #[inline] fn from_iter>(iter: I) -> Self { - iter.into_iter().collect_ca("") + iter.into_iter().collect_ca(PlSmallStr::const_default()) } } impl FromIterator for NoNull { #[inline] fn from_iter>(iter: I) -> Self { - NoNull::new(iter.into_iter().collect_ca("")) + NoNull::new(iter.into_iter().collect_ca(PlSmallStr::const_default())) } } @@ -69,7 +69,7 @@ where #[inline] fn from_iter>>(iter: I) -> Self { let arr = MutableBinaryViewArray::from_iterator(iter.into_iter()).freeze(); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } @@ -95,7 +95,7 @@ where #[inline] fn from_iter>(iter: I) -> Self { let arr = MutableBinaryViewArray::from_values_iter(iter.into_iter()).freeze(); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } @@ -107,7 +107,7 @@ where #[inline] fn from_iter>>(iter: I) -> Self { let arr = MutableBinaryViewArray::from_iter(iter).freeze(); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } @@ -118,7 +118,7 @@ where #[inline] fn from_iter>(iter: I) -> Self { let arr = MutableBinaryViewArray::from_values_iter(iter.into_iter()).freeze(); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } @@ -134,11 +134,16 @@ where // first take one to get the dtype. let v = match it.next() { Some(v) => v, - None => return ListChunked::full_null("", 0), + None => return ListChunked::full_null(PlSmallStr::const_default(), 0), }; // We don't know the needed capacity. We arbitrarily choose an average of 5 elements per series. 
- let mut builder = - get_list_builder(v.borrow().dtype(), capacity * 5, capacity, "collected").unwrap(); + let mut builder = get_list_builder( + v.borrow().dtype(), + capacity * 5, + capacity, + PlSmallStr::const_default(), + ) + .unwrap(); builder.append_series(v.borrow()).unwrap(); for s in it { @@ -166,7 +171,9 @@ impl FromIterator> for ListChunked { Some(None) => { init_null_count += 1; }, - None => return ListChunked::full_null("", init_null_count), + None => { + return ListChunked::full_null(PlSmallStr::const_default(), init_null_count) + }, } } @@ -182,7 +189,8 @@ impl FromIterator> for ListChunked { // the empty arrays is then not added (we add an extra offset instead) // the next non-empty series then must have the correct dtype. if matches!(first_s.dtype(), DataType::Null) && first_s.is_empty() { - let mut builder = AnonymousOwnedListBuilder::new("collected", capacity, None); + let mut builder = + AnonymousOwnedListBuilder::new(PlSmallStr::const_default(), capacity, None); for _ in 0..init_null_count { builder.append_null(); } @@ -196,8 +204,11 @@ impl FromIterator> for ListChunked { match first_s.dtype() { #[cfg(feature = "object")] DataType::Object(_, _) => { - let mut builder = - first_s.get_list_builder("collected", capacity * 5, capacity); + let mut builder = first_s.get_list_builder( + PlSmallStr::const_default(), + capacity * 5, + capacity, + ); for _ in 0..init_null_count { builder.append_null(); } @@ -214,7 +225,7 @@ impl FromIterator> for ListChunked { first_s.dtype(), capacity * 5, capacity, - "collected", + PlSmallStr::const_default(), ) .unwrap(); @@ -238,7 +249,7 @@ impl FromIterator> for ListChunked { impl FromIterator>> for ListChunked { #[inline] fn from_iter>>>(iter: I) -> Self { - iter.into_iter().collect_ca("collected") + iter.into_iter().collect_ca(PlSmallStr::const_default()) } } @@ -274,7 +285,10 @@ impl FromIterator> for ObjectChunked { len, }); ChunkedArray::new_with_compute_len( - Arc::new(Field::new("", get_object_type::())), + 
Arc::new(Field::new( + PlSmallStr::const_default(), + get_object_type::(), + )), vec![arr], ) } diff --git a/crates/polars-core/src/chunked_array/from_iterator_par.rs b/crates/polars-core/src/chunked_array/from_iterator_par.rs index eaf45d1d651f..e9e7f787cf40 100644 --- a/crates/polars-core/src/chunked_array/from_iterator_par.rs +++ b/crates/polars-core/src/chunked_array/from_iterator_par.rs @@ -72,7 +72,7 @@ where let vectors = collect_into_linked_list_vec(iter); let vectors = vectors.into_iter().collect::>(); let values = flatten_par(&vectors); - NoNull::new(ChunkedArray::new_vec("", values)) + NoNull::new(ChunkedArray::new_vec(PlSmallStr::const_default(), values)) } } @@ -82,21 +82,21 @@ where { fn from_par_iter>>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutablePrimitiveArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } impl FromParallelIterator for BooleanChunked { fn from_par_iter>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBooleanArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } impl FromParallelIterator> for BooleanChunked { fn from_par_iter>>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBooleanArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } @@ -106,7 +106,7 @@ where { fn from_par_iter>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } @@ -116,7 +116,7 @@ where { fn from_par_iter>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new); - Self::from_chunk_iter("", 
chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } @@ -126,7 +126,7 @@ where { fn from_par_iter>>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } @@ -136,12 +136,12 @@ where { fn from_par_iter>>(iter: I) -> Self { let chunks = collect_into_linked_list(iter, MutableBinaryViewArray::new); - Self::from_chunk_iter("", chunks).optional_rechunk() + Self::from_chunk_iter(PlSmallStr::const_default(), chunks).optional_rechunk() } } pub trait FromParIterWithDtype { - fn from_par_iter_with_dtype(iter: I, name: &str, dtype: DataType) -> Self + fn from_par_iter_with_dtype(iter: I, name: PlSmallStr, dtype: DataType) -> Self where I: IntoParallelIterator, Self: Sized; @@ -171,7 +171,7 @@ fn get_dtype(vectors: &LinkedList>>) -> DataType { } fn materialize_list( - name: &str, + name: PlSmallStr, vectors: &LinkedList>>, dtype: DataType, value_capacity: usize, @@ -217,15 +217,25 @@ impl FromParallelIterator> for ListChunked { let value_capacity = get_value_cap(&vectors); let dtype = get_dtype(&vectors); if let DataType::Null = dtype { - ListChunked::full_null_with_dtype("", list_capacity, &DataType::Null) + ListChunked::full_null_with_dtype( + PlSmallStr::const_default(), + list_capacity, + &DataType::Null, + ) } else { - materialize_list("", &vectors, dtype, value_capacity, list_capacity) + materialize_list( + PlSmallStr::const_default(), + &vectors, + dtype, + value_capacity, + list_capacity, + ) } } } impl FromParIterWithDtype> for ListChunked { - fn from_par_iter_with_dtype(iter: I, name: &str, dtype: DataType) -> Self + fn from_par_iter_with_dtype(iter: I, name: PlSmallStr, dtype: DataType) -> Self where I: IntoParallelIterator>, Self: Sized, @@ -245,7 +255,7 @@ impl FromParIterWithDtype> for ListChunked { pub trait 
ChunkedCollectParIterExt: ParallelIterator { fn collect_ca_with_dtype>( self, - name: &str, + name: PlSmallStr, dtype: DataType, ) -> B where @@ -264,7 +274,7 @@ where T: Send, E: Send, { - fn from_par_iter_with_dtype(par_iter: I, name: &str, dtype: DataType) -> Self + fn from_par_iter_with_dtype(par_iter: I, name: PlSmallStr, dtype: DataType) -> Self where I: IntoParallelIterator>, { diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs index addf1c7796d2..1e184d5b0746 100644 --- a/crates/polars-core/src/chunked_array/iterator/mod.rs +++ b/crates/polars-core/src/chunked_array/iterator/mod.rs @@ -220,7 +220,7 @@ impl<'a> IntoIterator for &'a ListChunked { .trust_my_length(self.len()) .map(move |arr| { Some(Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![arr], dtype, )) @@ -236,7 +236,11 @@ impl<'a> IntoIterator for &'a ListChunked { .trust_my_length(self.len()) .map(move |arr| { arr.map(|arr| { - Series::from_chunks_and_dtype_unchecked("", vec![arr], dtype) + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr], + dtype, + ) }) }), ) @@ -256,7 +260,13 @@ impl ListChunked { unsafe { self.downcast_iter() .flat_map(|arr| arr.values_iter()) - .map(move |arr| Series::from_chunks_and_dtype_unchecked("", vec![arr], inner_type)) + .map(move |arr| { + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr], + inner_type, + ) + }) .trust_my_length(self.len()) } } @@ -278,7 +288,7 @@ impl<'a> IntoIterator for &'a ArrayChunked { .trust_my_length(self.len()) .map(move |arr| { Some(Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![arr], dtype, )) @@ -294,7 +304,11 @@ impl<'a> IntoIterator for &'a ArrayChunked { .trust_my_length(self.len()) .map(move |arr| { arr.map(|arr| { - Series::from_chunks_and_dtype_unchecked("", vec![arr], dtype) + Series::from_chunks_and_dtype_unchecked( + 
PlSmallStr::const_default(), + vec![arr], + dtype, + ) }) }), ) @@ -336,7 +350,7 @@ impl<'a> Iterator for FixedSizeListIterNoNull<'a> { self.current += 1; unsafe { Some(Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![self.array.value_unchecked(old)], &self.inner_type, )) @@ -360,7 +374,13 @@ impl<'a> DoubleEndedIterator for FixedSizeListIterNoNull<'a> { } else { self.current_end -= 1; unsafe { - Some(Series::try_from(("", self.array.value_unchecked(self.current_end))).unwrap()) + Some( + Series::try_from(( + PlSmallStr::const_default(), + self.array.value_unchecked(self.current_end), + )) + .unwrap(), + ) } } } @@ -456,8 +476,8 @@ mod test { #[test] fn out_of_bounds() { - let mut a = UInt32Chunked::from_slice("a", &[1, 2, 3]); - let b = UInt32Chunked::from_slice("a", &[1, 2, 3]); + let mut a = UInt32Chunked::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3]); + let b = UInt32Chunked::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3]); a.append(&b).unwrap(); let v = a.into_iter().collect::>(); @@ -482,7 +502,10 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let a = <$ca_type>::from_slice("test", &[$first_val, $second_val, $third_val]); + let a = <$ca_type>::from_slice( + PlSmallStr::from_static("test"), + &[$first_val, $second_val, $third_val], + ); // normal iterator let mut it = a.into_iter(); @@ -543,7 +566,10 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let a = <$ca_type>::new("test", &[$first_val, $second_val, $third_val]); + let a = <$ca_type>::new( + PlSmallStr::from_static("test"), + &[$first_val, $second_val, $third_val], + ); // normal iterator let mut it = a.into_iter(); @@ -622,8 +648,11 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let mut a = 
<$ca_type>::from_slice("test", &[$first_val, $second_val]); - let a_b = <$ca_type>::from_slice("", &[$third_val]); + let mut a = <$ca_type>::from_slice( + PlSmallStr::from_static("test"), + &[$first_val, $second_val], + ); + let a_b = <$ca_type>::from_slice(PlSmallStr::const_default(), &[$third_val]); a.append(&a_b).unwrap(); // normal iterator @@ -685,8 +714,9 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let mut a = <$ca_type>::new("test", &[$first_val, $second_val]); - let a_b = <$ca_type>::new("", &[$third_val]); + let mut a = + <$ca_type>::new(PlSmallStr::from_static("test"), &[$first_val, $second_val]); + let a_b = <$ca_type>::new(PlSmallStr::const_default(), &[$third_val]); a.append(&a_b).unwrap(); // normal iterator @@ -766,7 +796,10 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let a = <$ca_type>::from_slice("test", &[$first_val, $second_val, $third_val]); + let a = <$ca_type>::from_slice( + PlSmallStr::from_static("test"), + &[$first_val, $second_val, $third_val], + ); // normal iterator let mut it = a.into_no_null_iter(); @@ -839,8 +872,11 @@ mod test { ($test_name:ident, $ca_type:ty, $first_val:expr, $second_val:expr, $third_val:expr) => { #[test] fn $test_name() { - let mut a = <$ca_type>::from_slice("test", &[$first_val, $second_val]); - let a_b = <$ca_type>::from_slice("", &[$third_val]); + let mut a = <$ca_type>::from_slice( + PlSmallStr::from_static("test"), + &[$first_val, $second_val], + ); + let a_b = <$ca_type>::from_slice(PlSmallStr::const_default(), &[$third_val]); a.append(&a_b).unwrap(); // normal iterator @@ -946,7 +982,10 @@ mod test { } impl_test_iter_skip!(utf8_iter_single_chunk_skip, 8, Some("0"), Some("9"), { - StringChunked::from_slice("test", &generate_utf8_vec(SKIP_ITERATOR_SIZE)) + StringChunked::from_slice( + PlSmallStr::from_static("test"), + 
&generate_utf8_vec(SKIP_ITERATOR_SIZE), + ) }); impl_test_iter_skip!( @@ -954,19 +993,36 @@ mod test { 8, Some("0"), None, - { StringChunked::new("test", &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE)) } + { + StringChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE), + ) + } ); impl_test_iter_skip!(utf8_iter_many_chunk_skip, 18, Some("0"), Some("9"), { - let mut a = StringChunked::from_slice("test", &generate_utf8_vec(SKIP_ITERATOR_SIZE)); - let a_b = StringChunked::from_slice("test", &generate_utf8_vec(SKIP_ITERATOR_SIZE)); + let mut a = StringChunked::from_slice( + PlSmallStr::from_static("test"), + &generate_utf8_vec(SKIP_ITERATOR_SIZE), + ); + let a_b = StringChunked::from_slice( + PlSmallStr::from_static("test"), + &generate_utf8_vec(SKIP_ITERATOR_SIZE), + ); a.append(&a_b).unwrap(); a }); impl_test_iter_skip!(utf8_iter_many_chunk_null_check_skip, 18, Some("0"), None, { - let mut a = StringChunked::new("test", &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE)); - let a_b = StringChunked::new("test", &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE)); + let mut a = StringChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE), + ); + let a_b = StringChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_utf8_vec(SKIP_ITERATOR_SIZE), + ); a.append(&a_b).unwrap(); a }); @@ -987,23 +1043,41 @@ mod test { } impl_test_iter_skip!(bool_iter_single_chunk_skip, 8, Some(true), Some(false), { - BooleanChunked::from_slice("test", &generate_boolean_vec(SKIP_ITERATOR_SIZE)) + BooleanChunked::from_slice( + PlSmallStr::from_static("test"), + &generate_boolean_vec(SKIP_ITERATOR_SIZE), + ) }); impl_test_iter_skip!(bool_iter_single_chunk_null_check_skip, 8, None, None, { - BooleanChunked::new("test", &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE)) + BooleanChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE), + ) }); impl_test_iter_skip!(bool_iter_many_chunk_skip, 
18, Some(true), Some(false), { - let mut a = BooleanChunked::from_slice("test", &generate_boolean_vec(SKIP_ITERATOR_SIZE)); - let a_b = BooleanChunked::from_slice("test", &generate_boolean_vec(SKIP_ITERATOR_SIZE)); + let mut a = BooleanChunked::from_slice( + PlSmallStr::from_static("test"), + &generate_boolean_vec(SKIP_ITERATOR_SIZE), + ); + let a_b = BooleanChunked::from_slice( + PlSmallStr::from_static("test"), + &generate_boolean_vec(SKIP_ITERATOR_SIZE), + ); a.append(&a_b).unwrap(); a }); impl_test_iter_skip!(bool_iter_many_chunk_null_check_skip, 18, None, None, { - let mut a = BooleanChunked::new("test", &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE)); - let a_b = BooleanChunked::new("test", &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE)); + let mut a = BooleanChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE), + ); + let a_b = BooleanChunked::new( + PlSmallStr::from_static("test"), + &generate_opt_boolean_vec(SKIP_ITERATOR_SIZE), + ); a.append(&a_b).unwrap(); a }); diff --git a/crates/polars-core/src/chunked_array/iterator/par/list.rs b/crates/polars-core/src/chunked_array/iterator/par/list.rs index 63eb77753215..bd43033e39a3 100644 --- a/crates/polars-core/src/chunked_array/iterator/par/list.rs +++ b/crates/polars-core/src/chunked_array/iterator/par/list.rs @@ -4,8 +4,9 @@ use crate::prelude::*; unsafe fn idx_to_array(idx: usize, arr: &ListArray, dtype: &DataType) -> Option { if arr.is_valid(idx) { - Some(arr.value_unchecked(idx)) - .map(|arr: ArrayRef| Series::from_chunks_and_dtype_unchecked("", vec![arr], dtype)) + Some(arr.value_unchecked(idx)).map(|arr: ArrayRef| { + Series::from_chunks_and_dtype_unchecked(PlSmallStr::const_default(), vec![arr], dtype) + }) } else { None } diff --git a/crates/polars-core/src/chunked_array/list/iterator.rs b/crates/polars-core/src/chunked_array/list/iterator.rs index 19d6fc90c952..2303a7592f64 100644 --- a/crates/polars-core/src/chunked_array/list/iterator.rs +++ 
b/crates/polars-core/src/chunked_array/list/iterator.rs @@ -51,7 +51,7 @@ impl<'a, I: Iterator>> Iterator for AmortizedListIter<'a // dtype is known unsafe { let s = Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![array_ref], &self.inner_dtype.to_physical(), ) @@ -69,7 +69,7 @@ impl<'a, I: Iterator>> Iterator for AmortizedListIter<'a { let (s, ptr) = unsafe { unstable_series_container_and_ptr( - self.series_container.name(), + self.series_container.name().clone(), array_ref, self.series_container.dtype(), ) @@ -123,13 +123,13 @@ impl ListChunked { /// If the returned `AmortSeries` is cloned, the local copy will be replaced and a new container /// will be set. pub fn amortized_iter(&self) -> AmortizedListIter> + '_> { - self.amortized_iter_with_name("") + self.amortized_iter_with_name(PlSmallStr::const_default()) } /// See `amortized_iter`. pub fn amortized_iter_with_name( &self, - name: &str, + name: PlSmallStr, ) -> AmortizedListIter> + '_> { // we create the series container from the inner array // so that the container has the proper dtype. @@ -172,7 +172,7 @@ impl ListChunked { V::Array: ArrayFromIter>, { // TODO! make an amortized iter that does not flatten - self.amortized_iter().map(f).collect_ca(self.name()) + self.amortized_iter().map(f).collect_ca(self.name().clone()) } pub fn try_apply_amortized_generic(&self, f: F) -> PolarsResult> @@ -182,7 +182,9 @@ impl ListChunked { V::Array: ArrayFromIter>, { // TODO! 
make an amortized iter that does not flatten - self.amortized_iter().map(f).try_collect_ca(self.name()) + self.amortized_iter() + .map(f) + .try_collect_ca(self.name().clone()) } pub fn for_each_amortized(&self, f: F) @@ -224,7 +226,7 @@ impl ListChunked { .collect_trusted() }; - out.rename(self.name()); + out.rename(self.name().clone()); if fast_explode { out.set_fast_explode(); } @@ -271,7 +273,7 @@ impl ListChunked { .collect_trusted() }; - out.rename(self.name()); + out.rename(self.name().clone()); if fast_explode { out.set_fast_explode(); } @@ -312,7 +314,7 @@ impl ListChunked { .collect::>()? }; - out.rename(self.name()); + out.rename(self.name().clone()); if fast_explode { out.set_fast_explode(); } @@ -343,7 +345,7 @@ impl ListChunked { .collect_trusted() }; - ca.rename(self.name()); + ca.rename(self.name().clone()); if fast_explode { ca.set_fast_explode(); } @@ -375,7 +377,7 @@ impl ListChunked { }) .collect::>()? }; - ca.rename(self.name()); + ca.rename(self.name().clone()); if fast_explode { ca.set_fast_explode(); } @@ -390,10 +392,17 @@ mod test { #[test] fn test_iter_list() { - let mut builder = get_list_builder(&DataType::Int32, 10, 10, "").unwrap(); - builder.append_series(&Series::new("", &[1, 2, 3])).unwrap(); - builder.append_series(&Series::new("", &[3, 2, 1])).unwrap(); - builder.append_series(&Series::new("", &[1, 1])).unwrap(); + let mut builder = + get_list_builder(&DataType::Int32, 10, 10, PlSmallStr::const_default()).unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[1, 2, 3])) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[3, 2, 1])) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[1, 1])) + .unwrap(); let ca = builder.finish(); ca.amortized_iter().zip(&ca).for_each(|(s1, s2)| { diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs index 81adf6a48d30..903fcf6cab14 100644 --- 
a/crates/polars-core/src/chunked_array/list/mod.rs +++ b/crates/polars-core/src/chunked_array/list/mod.rs @@ -41,7 +41,9 @@ impl ListChunked { let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect(); // SAFETY: Data type of arrays matches because they are chunks from the same array. - unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, self.inner_dtype()) } + unsafe { + Series::from_chunks_and_dtype_unchecked(self.name().clone(), chunks, self.inner_dtype()) + } } /// Returns an iterator over the offsets of this chunked array. @@ -76,7 +78,7 @@ impl ListChunked { // Inner dtype is passed correctly let elements = unsafe { Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![arr.values().clone()], ca.inner_dtype(), ) @@ -102,7 +104,7 @@ impl ListChunked { // SAFETY: arr's inner dtype is derived from out dtype. Ok(unsafe { ListChunked::from_chunks_and_dtype_unchecked( - ca.name(), + ca.name().clone(), vec![Box::new(arr)], DataType::List(Box::new(out.dtype().clone())), ) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs index d2b4ee332673..23739e71d965 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -13,7 +13,7 @@ struct KeyWrapper(u32); pub struct CategoricalChunkedBuilder { cat_builder: UInt32Vec, - name: String, + name: PlSmallStr, ordering: CategoricalOrdering, categories: MutablePlString, // hashmap utilized by the local builder @@ -21,10 +21,10 @@ pub struct CategoricalChunkedBuilder { } impl CategoricalChunkedBuilder { - pub fn new(name: &str, capacity: usize, ordering: CategoricalOrdering) -> Self { + pub fn new(name: PlSmallStr, capacity: usize, ordering: CategoricalOrdering) -> Self { Self { cat_builder: UInt32Vec::with_capacity(capacity), - name: name.to_string(), + name, 
ordering, categories: MutablePlString::with_capacity(_HASHMAP_INIT_SIZE), local_mapping: PlHashMap::with_capacity_and_hasher( @@ -166,7 +166,7 @@ impl CategoricalChunkedBuilder { ); let indices = std::mem::take(&mut self.cat_builder).into(); - let indices = UInt32Chunked::with_chunk(&self.name, indices); + let indices = UInt32Chunked::with_chunk(self.name.clone(), indices); // SAFETY: indices are in bounds of new rev_map unsafe { @@ -196,7 +196,7 @@ impl CategoricalChunkedBuilder { // SAFETY: keys and values are in bounds unsafe { CategoricalChunked::from_keys_and_values( - &self.name, + self.name.clone(), &self.cat_builder.into(), &self.categories.into(), self.ordering, @@ -271,7 +271,7 @@ impl CategoricalChunked { } pub(crate) unsafe fn from_keys_and_values_global( - name: &str, + name: PlSmallStr, keys: impl IntoIterator> + Send, capacity: usize, values: &Utf8ViewArray, @@ -317,7 +317,7 @@ impl CategoricalChunked { } pub(crate) unsafe fn from_keys_and_values_local( - name: &str, + name: PlSmallStr, keys: &PrimitiveArray, values: &Utf8ViewArray, ordering: CategoricalOrdering, @@ -333,7 +333,7 @@ impl CategoricalChunked { /// # Safety /// The caller must ensure that index values in the `keys` are in within bounds of the `values` length. 
pub(crate) unsafe fn from_keys_and_values( - name: &str, + name: PlSmallStr, keys: &PrimitiveArray, values: &Utf8ViewArray, ordering: CategoricalOrdering, @@ -372,8 +372,8 @@ impl CategoricalChunked { .map(|opt_s: Option<&str>| opt_s.and_then(|s| map.get(s).copied())) .collect_arr() }); - let mut keys: UInt32Chunked = ChunkedArray::from_chunk_iter(values.name(), iter); - keys.rename(values.name()); + let mut keys: UInt32Chunked = ChunkedArray::from_chunk_iter(values.name().clone(), iter); + keys.rename(values.name().clone()); let rev_map = RevMapping::build_local(categories.clone()); unsafe { Ok(CategoricalChunked::from_cats_and_rev_map_unchecked( @@ -403,7 +403,7 @@ mod test { Some("foo"), Some("bar"), ]; - let ca = StringChunked::new("a", slice); + let ca = StringChunked::new(PlSmallStr::from_static("a"), slice); let out = ca.cast(&DataType::Categorical(None, Default::default()))?; let out = out.categorical().unwrap().clone(); assert_eq!(out.get_rev_map().len(), 2); @@ -422,10 +422,10 @@ mod test { // Check that we don't panic if we append two categorical arrays // build under the same string cache // https://github.com/pola-rs/polars/issues/1115 - let ca1 = StringChunked::new("a", slice) + let ca1 = StringChunked::new(PlSmallStr::from_static("a"), slice) .cast(&DataType::Categorical(None, Default::default()))?; let mut ca1 = ca1.categorical().unwrap().clone(); - let ca2 = StringChunked::new("a", slice) + let ca2 = StringChunked::new(PlSmallStr::from_static("a"), slice) .cast(&DataType::Categorical(None, Default::default()))?; let ca2 = ca2.categorical().unwrap(); ca1.append(ca2).unwrap(); @@ -445,8 +445,16 @@ mod test { // Use 2 builders to check if the global string cache // does not interfere with the index mapping - let builder1 = CategoricalChunkedBuilder::new("foo", 10, Default::default()); - let builder2 = CategoricalChunkedBuilder::new("foo", 10, Default::default()); + let builder1 = CategoricalChunkedBuilder::new( + PlSmallStr::from_static("foo"), + 10, 
+ Default::default(), + ); + let builder2 = CategoricalChunkedBuilder::new( + PlSmallStr::from_static("foo"), + 10, + Default::default(), + ); let s = builder1 .drain_iter_and_finish(vec![None, Some("hello"), Some("vietnam")]) .into_series(); diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 5539668ad740..503ddb0152aa 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -37,7 +37,7 @@ pub struct CategoricalChunked { impl CategoricalChunked { pub(crate) fn field(&self) -> Field { let name = self.physical().name(); - Field::new(name, self.dtype().clone()) + Field::new(name.clone(), self.dtype().clone()) } pub fn is_empty(&self) -> bool { @@ -54,7 +54,7 @@ impl CategoricalChunked { self.physical.null_count() } - pub fn name(&self) -> &str { + pub fn name(&self) -> &PlSmallStr { self.physical.name() } @@ -122,7 +122,7 @@ impl CategoricalChunked { // SAFETY: keys and values are in bounds unsafe { Ok(CategoricalChunked::from_keys_and_values_global( - self.name(), + self.name().clone(), self.physical(), self.len(), categories, @@ -337,7 +337,8 @@ impl LogicalType for CategoricalChunked { DataType::String => { let mapping = &**self.get_rev_map(); - let mut builder = StringChunkedBuilder::new(self.physical.name(), self.len()); + let mut builder = + StringChunkedBuilder::new(self.physical.name().clone(), self.len()); let f = |idx: u32| mapping.get(idx); @@ -356,7 +357,10 @@ impl LogicalType for CategoricalChunked { }, DataType::UInt32 => { let ca = unsafe { - UInt32Chunked::from_chunks(self.physical.name(), self.physical.chunks.clone()) + UInt32Chunked::from_chunks( + self.physical.name().clone(), + self.physical.chunks.clone(), + ) }; Ok(ca.into_series()) }, @@ -369,7 +373,7 @@ impl LogicalType for CategoricalChunked { .to_enum(categories, *hash) .set_ordering(*ordering, true) 
.into_series() - .with_name(self.name())) + .with_name(self.name().clone())) }, DataType::Enum(None, _) => { polars_bail!(ComputeError: "can not cast to enum without categories present") @@ -393,7 +397,7 @@ impl LogicalType for CategoricalChunked { dt if dt.is_numeric() => { // Apply the cast to the categories and then index into the casted series let categories = StringChunked::with_chunk( - self.physical.name(), + self.physical.name().clone(), self.get_rev_map().get_categories().clone(), ); let casted_series = categories.cast_with_options(dtype, options)?; @@ -460,12 +464,12 @@ mod test { Some("foo"), Some("bar"), ]; - let ca = StringChunked::new("a", slice); + let ca = StringChunked::new(PlSmallStr::from_static("a"), slice); let ca = ca.cast(&DataType::Categorical(None, Default::default()))?; let ca = ca.categorical().unwrap(); let arr = ca.to_arrow(CompatLevel::newest(), false); - let s = Series::try_from(("foo", arr))?; + let s = Series::try_from((PlSmallStr::from_static("foo"), arr))?; assert!(matches!(s.dtype(), &DataType::Categorical(_, _))); assert_eq!(s.null_count(), 1); assert_eq!(s.len(), 6); @@ -479,10 +483,10 @@ mod test { disable_string_cache(); enable_string_cache(); - let mut s1 = Series::new("1", vec!["a", "b", "c"]) + let mut s1 = Series::new(PlSmallStr::from_static("1"), vec!["a", "b", "c"]) .cast(&DataType::Categorical(None, Default::default())) .unwrap(); - let s2 = Series::new("2", vec!["a", "x", "y"]) + let s2 = Series::new(PlSmallStr::from_static("2"), vec!["a", "x", "y"]) .cast(&DataType::Categorical(None, Default::default())) .unwrap(); let appended = s1.append(&s2).unwrap(); @@ -495,13 +499,15 @@ mod test { #[test] fn test_fast_unique() { let _lock = SINGLE_LOCK.lock(); - let s = Series::new("1", vec!["a", "b", "c"]) + let s = Series::new(PlSmallStr::from_static("1"), vec!["a", "b", "c"]) .cast(&DataType::Categorical(None, Default::default())) .unwrap(); assert_eq!(s.n_unique().unwrap(), 3); // Make sure that it does not take the fast 
path after take/slice. - let out = s.take(&IdxCa::new("", [1, 2])).unwrap(); + let out = s + .take(&IdxCa::new(PlSmallStr::const_default(), [1, 2])) + .unwrap(); assert_eq!(out.n_unique().unwrap(), 2); let out = s.slice(1, 2); assert_eq!(out.n_unique().unwrap(), 2); @@ -513,12 +519,15 @@ mod test { disable_string_cache(); // tests several things that may lose the dtype information - let s = Series::new("a", vec!["a", "b", "c"]) + let s = Series::new(PlSmallStr::from_static("a"), vec!["a", "b", "c"]) .cast(&DataType::Categorical(None, Default::default()))?; assert_eq!( s.field().into_owned(), - Field::new("a", DataType::Categorical(None, Default::default())) + Field::new( + PlSmallStr::from_static("a"), + DataType::Categorical(None, Default::default()) + ) ); assert!(matches!( s.get(0)?, diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs index 959717155ce3..ed53722d163e 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/full.rs @@ -2,7 +2,7 @@ use super::*; impl CategoricalChunked { pub fn full_null( - name: &str, + name: PlSmallStr, is_enum: bool, length: usize, ordering: CategoricalOrdering, diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 21a4bfb96a6a..a0f4a4ef90db 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -5,12 +5,14 @@ impl CategoricalChunked { let cat_map = self.get_rev_map(); if self._can_fast_unique() { let ca = match &**cat_map { - RevMapping::Local(a, _) => { - UInt32Chunked::from_iter_values(self.physical().name(), 0..(a.len() as u32)) - }, - RevMapping::Global(map, _, _) => { - 
UInt32Chunked::from_iter_values(self.physical().name(), map.keys().copied()) - }, + RevMapping::Local(a, _) => UInt32Chunked::from_iter_values( + self.physical().name().clone(), + 0..(a.len() as u32), + ), + RevMapping::Global(map, _, _) => UInt32Chunked::from_iter_values( + self.physical().name().clone(), + map.keys().copied(), + ), }; // SAFETY: // we only removed some indexes so we are still in bounds @@ -63,7 +65,7 @@ impl CategoricalChunked { *values.physical_mut() = physical_values; let mut counts = groups.group_count(); - counts.rename("counts"); + counts.rename(PlSmallStr::from_static("counts")); let cols = vec![values.into_series(), counts.into_series()]; let df = unsafe { DataFrame::new_no_checks(cols) }; df.sort( diff --git a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs index be3c678b0f18..a0bd2687af63 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs @@ -5,7 +5,7 @@ use std::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; use hashbrown::hash_map::RawEntryMut; use once_cell::sync::Lazy; use polars_utils::aliases::PlRandomState; -use smartstring::{LazyCompact, SmartString}; +use polars_utils::pl_str::PlSmallStr; use crate::datatypes::{InitHashMaps2, PlIdHashMap}; use crate::hashing::_HASHMAP_INIT_SIZE; @@ -133,7 +133,7 @@ impl Hash for Key { pub(crate) struct SCacheInner { map: PlIdHashMap, pub(crate) uuid: u32, - payloads: Vec, + payloads: Vec, } impl SCacheInner { @@ -149,8 +149,8 @@ impl SCacheInner { #[inline] pub(crate) fn insert_from_hash(&mut self, h: u64, s: &str) -> u32 { let mut global_idx = self.payloads.len() as u32; - // Note that we don't create the StrHashGlobal to search the key in the hashmap - // as StrHashGlobal may allocate a string + // Note that we don't create the PlSmallStr to search the key in the hashmap + 
// as PlSmallStr may allocate a string let entry = self.map.raw_entry_mut().from_hash(h, |key| { (key.hash == h) && { let pos = key.idx as usize; @@ -169,7 +169,7 @@ impl SCacheInner { entry.insert_hashed_nocheck(h, key, ()); // only just now we allocate the string - self.payloads.push(s.into()); + self.payloads.push(PlSmallStr::from_str(s)); }, } global_idx @@ -178,7 +178,6 @@ impl SCacheInner { #[inline] pub(crate) fn get_cat(&self, s: &str) -> Option { let h = StringCache::get_hash_builder().hash_one(s); - // as StrHashGlobal may allocate a string self.map .raw_entry() .from_hash(h, |key| { @@ -254,5 +253,3 @@ impl StringCache { } pub(crate) static STRING_CACHE: Lazy = Lazy::new(Default::default); - -type StrHashGlobal = SmartString; diff --git a/crates/polars-core/src/chunked_array/logical/decimal.rs b/crates/polars-core/src/chunked_array/logical/decimal.rs index b8bd978fda0e..64134d5e62ad 100644 --- a/crates/polars-core/src/chunked_array/logical/decimal.rs +++ b/crates/polars-core/src/chunked_array/logical/decimal.rs @@ -104,7 +104,7 @@ impl LogicalType for DecimalChunked { let chunks = cast_chunks(&self.chunks, dtype, cast_options)?; unsafe { Ok(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), chunks, dtype, )) @@ -134,7 +134,8 @@ impl DecimalChunked { let dtype = DataType::Decimal(None, Some(scale)); let chunks = cast_chunks(&self.chunks, &dtype, CastOptions::NonStrict)?; - let mut dt = Self::new_logical(unsafe { Int128Chunked::from_chunks(self.name(), chunks) }); + let mut dt = + Self::new_logical(unsafe { Int128Chunked::from_chunks(self.name().clone(), chunks) }); dt.2 = Some(dtype); Ok(Cow::Owned(dt)) } diff --git a/crates/polars-core/src/chunked_array/logical/mod.rs b/crates/polars-core/src/chunked_array/logical/mod.rs index 8ed6dc4dae35..d4e6d4eb84aa 100644 --- a/crates/polars-core/src/chunked_array/logical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/mod.rs @@ -97,6 +97,6 @@ where } pub fn field(&self) -> 
Field { let name = self.0.ref_field().name(); - Field::new(name, LogicalType::dtype(self).clone()) + Field::new(name.clone(), LogicalType::dtype(self).clone()) } } diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 7cea5e54d2dc..cf10c3c17b42 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -603,11 +603,11 @@ impl ChunkedArray { } pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) { - self.field = Arc::new(Field::new(self.name(), dtype)) + self.field = Arc::new(Field::new(self.name().clone(), dtype)) } /// Name of the [`ChunkedArray`]. - pub fn name(&self) -> &str { + pub fn name(&self) -> &PlSmallStr { self.field.name() } @@ -617,12 +617,12 @@ impl ChunkedArray { } /// Rename this [`ChunkedArray`]. - pub fn rename(&mut self, name: &str) { + pub fn rename(&mut self, name: PlSmallStr) { self.field = Arc::new(Field::new(name, self.field.data_type().clone())) } /// Return this [`ChunkedArray`] with a new name. - pub fn with_name(mut self, name: &str) -> Self { + pub fn with_name(mut self, name: PlSmallStr) -> Self { self.rename(name); self } @@ -704,7 +704,7 @@ impl ListChunked { pub fn get_as_series(&self, idx: usize) -> Option { unsafe { Some(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![self.get(idx)?], &self.inner_dtype().to_physical(), )) @@ -718,7 +718,7 @@ impl ArrayChunked { pub fn get_as_series(&self, idx: usize) -> Option { unsafe { Some(Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![self.get(idx)?], &self.inner_dtype().to_physical(), )) @@ -754,7 +754,9 @@ where .collect(); // SAFETY: We just slice the original chunks, their type will not change. 
- unsafe { Self::from_chunks_and_dtype(self.name(), chunks, self.dtype().clone()) } + unsafe { + Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone()) + } }; if self.chunks.len() != 1 { @@ -949,7 +951,7 @@ pub(crate) fn to_array( impl Default for ChunkedArray { fn default() -> Self { ChunkedArray { - field: Arc::new(Field::new("default", DataType::Null)), + field: Arc::new(Field::new(PlSmallStr::const_default(), DataType::Null)), chunks: Default::default(), md: Arc::new(IMMetadata::default()), length: 0, @@ -963,19 +965,19 @@ pub(crate) mod test { use crate::prelude::*; pub(crate) fn get_chunked_array() -> Int32Chunked { - ChunkedArray::new("a", &[1, 2, 3]) + ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3]) } #[test] fn test_sort() { - let a = Int32Chunked::new("a", &[1, 9, 3, 2]); + let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]); let b = a .sort(false) .into_iter() .map(|opt| opt.unwrap()) .collect::>(); assert_eq!(b, [1, 2, 3, 9]); - let a = StringChunked::new("a", &["b", "a", "c"]); + let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]); let a = a.sort(false); let b = a.into_iter().collect::>(); assert_eq!(b, [Some("a"), Some("b"), Some("c")]); @@ -984,8 +986,8 @@ pub(crate) mod test { #[test] fn arithmetic() { - let a = &Int32Chunked::new("a", &[1, 100, 6, 40]); - let b = &Int32Chunked::new("b", &[-1, 2, 3, 4]); + let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]); + let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]); // Not really asserting anything here but still making sure the code is exercised // This (and more) is properly tested from the integration test suite and Python bindings. 
@@ -1014,7 +1016,10 @@ pub(crate) mod test { fn filter() { let a = get_chunked_array(); let b = a - .filter(&BooleanChunked::new("filter", &[true, false, false])) + .filter(&BooleanChunked::new( + PlSmallStr::from_static("filter"), + &[true, false, false], + )) .unwrap(); assert_eq!(b.len(), 1); assert_eq!(b.into_iter().next(), Some(Some(1))); @@ -1022,7 +1027,7 @@ pub(crate) mod test { #[test] fn aggregates() { - let a = &Int32Chunked::new("a", &[1, 100, 10, 9]); + let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]); assert_eq!(a.max(), Some(100)); assert_eq!(a.min(), Some(1)); assert_eq!(a.sum(), Some(120)) @@ -1051,8 +1056,8 @@ pub(crate) mod test { #[test] fn slice() { - let mut first = UInt32Chunked::new("first", &[0, 1, 2]); - let second = UInt32Chunked::new("second", &[3, 4, 5]); + let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]); + let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]); first.append(&second).unwrap(); assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]); assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]); @@ -1070,7 +1075,7 @@ pub(crate) mod test { #[test] fn sorting() { - let s = UInt32Chunked::new("", &[9, 2, 4]); + let s = UInt32Chunked::new(PlSmallStr::const_default(), &[9, 2, 4]); let sorted = s.sort(false); assert_slice_equal(&sorted, &[2, 4, 9]); let sorted = s.sort(true); @@ -1097,19 +1102,19 @@ pub(crate) mod test { #[test] fn reverse() { - let s = UInt32Chunked::new("", &[1, 2, 3]); + let s = UInt32Chunked::new(PlSmallStr::const_default(), &[1, 2, 3]); // path with continuous slice assert_slice_equal(&s.reverse(), &[3, 2, 1]); // path with options - let s = UInt32Chunked::new("", &[Some(1), None, Some(3)]); + let s = UInt32Chunked::new(PlSmallStr::const_default(), &[Some(1), None, Some(3)]); assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]); - let s = BooleanChunked::new("", &[true, false]); + let s = 
BooleanChunked::new(PlSmallStr::const_default(), &[true, false]); assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]); - let s = StringChunked::new("", &["a", "b", "c"]); + let s = StringChunked::new(PlSmallStr::const_default(), &["a", "b", "c"]); assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]); - let s = StringChunked::new("", &[Some("a"), None, Some("c")]); + let s = StringChunked::new(PlSmallStr::const_default(), &[Some("a"), None, Some("c")]); assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]); } @@ -1119,7 +1124,10 @@ pub(crate) mod test { use crate::{disable_string_cache, SINGLE_LOCK}; let _lock = SINGLE_LOCK.lock(); disable_string_cache(); - let ca = StringChunked::new("", &[Some("foo"), None, Some("bar"), Some("ham")]); + let ca = StringChunked::new( + PlSmallStr::const_default(), + &[Some("foo"), None, Some("bar"), Some("ham")], + ); let ca = ca .cast(&DataType::Categorical(None, Default::default())) .unwrap(); @@ -1131,7 +1139,7 @@ pub(crate) mod test { #[test] #[ignore] fn test_shrink_to_fit() { - let mut builder = StringChunkedBuilder::new("foo", 2048); + let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048); builder.append_value("foo"); let mut arr = builder.finish(); let before = arr diff --git a/crates/polars-core/src/chunked_array/ndarray.rs b/crates/polars-core/src/chunked_array/ndarray.rs index 9bff6d03f411..c1aeb6851aeb 100644 --- a/crates/polars-core/src/chunked_array/ndarray.rs +++ b/crates/polars-core/src/chunked_array/ndarray.rs @@ -83,8 +83,8 @@ impl DataFrame { /// /// ```rust /// use polars_core::prelude::*; - /// let a = UInt32Chunked::new("a", &[1, 2, 3]).into_series(); - /// let b = Float64Chunked::new("b", &[10., 8., 6.]).into_series(); + /// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_series(); + /// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_series(); /// /// let df = DataFrame::new(vec![a, b]).unwrap(); /// let ndarray = 
df.to_ndarray::(IndexOrder::Fortran).unwrap(); @@ -186,12 +186,16 @@ mod test { #[test] fn test_ndarray_from_ca() -> PolarsResult<()> { - let ca = Float64Chunked::new("", &[1.0, 2.0, 3.0]); + let ca = Float64Chunked::new(PlSmallStr::const_default(), &[1.0, 2.0, 3.0]); let ndarr = ca.to_ndarray()?; assert_eq!(ndarr, ArrayView1::from(&[1.0, 2.0, 3.0])); - let mut builder = - ListPrimitiveChunkedBuilder::::new("", 10, 10, DataType::Float64); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::const_default(), + 10, + 10, + DataType::Float64, + ); builder.append_opt_slice(Some(&[1.0, 2.0, 3.0])); builder.append_opt_slice(Some(&[2.0, 4.0, 5.0])); builder.append_opt_slice(Some(&[6.0, 7.0, 8.0])); @@ -202,8 +206,12 @@ mod test { assert_eq!(ndarr, expected); // test list array that is not square - let mut builder = - ListPrimitiveChunkedBuilder::::new("", 10, 10, DataType::Float64); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::const_default(), + 10, + 10, + DataType::Float64, + ); builder.append_opt_slice(Some(&[1.0, 2.0, 3.0])); builder.append_opt_slice(Some(&[2.0])); builder.append_opt_slice(Some(&[6.0, 7.0, 8.0])); diff --git a/crates/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs index d54fb8c7dea6..2ec6debe03b3 100644 --- a/crates/polars-core/src/chunked_array/object/builder.rs +++ b/crates/polars-core/src/chunked_array/object/builder.rs @@ -12,7 +12,7 @@ impl ObjectChunkedBuilder where T: PolarsObject, { - pub fn new(name: &str, capacity: usize) -> Self { + pub fn new(name: PlSmallStr, capacity: usize) -> Self { ObjectChunkedBuilder { field: Field::new(name, DataType::Object(T::type_name(), None)), values: Vec::with_capacity(capacity), @@ -78,7 +78,7 @@ where /// Initialize a polars Object data type. The type has got information needed to /// construct new objects. 
pub(crate) fn get_object_type() -> DataType { - let object_builder = Box::new(|name: &str, capacity: usize| { + let object_builder = Box::new(|name: PlSmallStr, capacity: usize| { Box::new(ObjectChunkedBuilder::::new(name, capacity)) as Box }); @@ -94,7 +94,7 @@ where T: PolarsObject, { fn default() -> Self { - ObjectChunkedBuilder::new("", 0) + ObjectChunkedBuilder::new(PlSmallStr::const_default(), 0) } } @@ -102,11 +102,11 @@ impl NewChunkedArray, T> for ObjectChunked where T: PolarsObject, { - fn from_slice(name: &str, v: &[T]) -> Self { + fn from_slice(name: PlSmallStr, v: &[T]) -> Self { Self::from_iter_values(name, v.iter().cloned()) } - fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + fn from_slice_options(name: PlSmallStr, opt_v: &[Option]) -> Self { let mut builder = ObjectChunkedBuilder::::new(name, opt_v.len()); opt_v .iter() @@ -115,14 +115,17 @@ where builder.finish() } - fn from_iter_options(name: &str, it: impl Iterator>) -> ObjectChunked { + fn from_iter_options( + name: PlSmallStr, + it: impl Iterator>, + ) -> ObjectChunked { let mut builder = ObjectChunkedBuilder::new(name, get_iter_capacity(&it)); it.for_each(|opt| builder.append_option(opt)); builder.finish() } /// Create a new ChunkedArray from an iterator. 
- fn from_iter_values(name: &str, it: impl Iterator) -> ObjectChunked { + fn from_iter_values(name: PlSmallStr, it: impl Iterator) -> ObjectChunked { let mut builder = ObjectChunkedBuilder::new(name, get_iter_capacity(&it)); it.for_each(|v| builder.append_value(v)); builder.finish() @@ -133,7 +136,7 @@ impl ObjectChunked where T: PolarsObject, { - pub fn new_from_vec(name: &str, v: Vec) -> Self { + pub fn new_from_vec(name: PlSmallStr, v: Vec) -> Self { let field = Arc::new(Field::new(name, DataType::Object(T::type_name(), None))); let len = v.len(); let arr = Box::new(ObjectArray { @@ -146,7 +149,7 @@ where unsafe { ObjectChunked::new_with_dims(field, vec![arr], len as IdxSize, 0) } } - pub fn new_from_vec_and_validity(name: &str, v: Vec, validity: Bitmap) -> Self { + pub fn new_from_vec_and_validity(name: PlSmallStr, v: Vec, validity: Bitmap) -> Self { let field = Arc::new(Field::new(name, DataType::Object(T::type_name(), None))); let len = v.len(); let null_count = validity.unset_bits(); @@ -162,7 +165,7 @@ where } } - pub fn new_empty(name: &str) -> Self { + pub fn new_empty(name: PlSmallStr) -> Self { Self::new_from_vec(name, vec![]) } } diff --git a/crates/polars-core/src/chunked_array/object/extension/list.rs b/crates/polars-core/src/chunked_array/object/extension/list.rs index fb4ea6d73a2c..e6ab34ceb4c8 100644 --- a/crates/polars-core/src/chunked_array/object/extension/list.rs +++ b/crates/polars-core/src/chunked_array/object/extension/list.rs @@ -6,7 +6,7 @@ use crate::prelude::*; impl ObjectChunked { pub(crate) fn get_list_builder( - name: &str, + name: PlSmallStr, values_capacity: usize, list_capacity: usize, ) -> Box { @@ -25,7 +25,7 @@ struct ExtensionListBuilder { } impl ExtensionListBuilder { - pub(crate) fn new(name: &str, values_capacity: usize, list_capacity: usize) -> Self { + pub(crate) fn new(name: PlSmallStr, values_capacity: usize, list_capacity: usize) -> Self { let mut offsets = Vec::with_capacity(list_capacity + 1); offsets.push(0); Self { 
@@ -80,7 +80,7 @@ impl ListBuilderTrait for ExtensionListBuilder { None, ); - let mut listarr = ListChunked::with_chunk(ca.name(), arr); + let mut listarr = ListChunked::with_chunk(ca.name().clone(), arr); if self.fast_explode { listarr.set_fast_explode() } diff --git a/crates/polars-core/src/chunked_array/object/extension/mod.rs b/crates/polars-core/src/chunked_array/object/extension/mod.rs index 51916297de91..f461364fbf13 100644 --- a/crates/polars-core/src/chunked_array/object/extension/mod.rs +++ b/crates/polars-core/src/chunked_array/object/extension/mod.rs @@ -9,6 +9,7 @@ use arrow::array::FixedSizeBinaryArray; use arrow::bitmap::MutableBitmap; use arrow::buffer::Buffer; use polars_extension::PolarsExtension; +use polars_utils::format_pl_smallstr; use crate::prelude::*; use crate::PROCESS_ID; @@ -39,9 +40,9 @@ unsafe fn create_drop(mut ptr: *const u8, n_t_vals: usize) -> Box>, - // A function on the heap that take a `array: FixedSizeBinary` and a `name: &str` + // A function on the heap that take a `array: FixedSizeBinary` and a `name: PlSmallStr` // and returns a `Series` of `ObjectChunked` - pub(crate) to_series_fn: Option Series>>, + pub(crate) to_series_fn: Option Series>>, } impl Drop for ExtensionSentinel { @@ -120,11 +121,14 @@ pub(crate) fn create_extension> + TrustedLen, T: Si let et_ptr = &*et as *const ExtensionSentinel; std::mem::forget(et); - let metadata = format!("{};{}", *PROCESS_ID, et_ptr as usize); + let metadata = format_pl_smallstr!("{};{}", *PROCESS_ID, et_ptr as usize); let physical_type = ArrowDataType::FixedSizeBinary(t_size); - let extension_type = - ArrowDataType::Extension(EXTENSION_NAME.into(), physical_type.into(), Some(metadata)); + let extension_type = ArrowDataType::Extension( + PlSmallStr::from_static(EXTENSION_NAME), + physical_type.into(), + Some(metadata), + ); // first freeze, otherwise we compute null let validity = if null_count > 0 { Some(validity.into()) @@ -217,7 +221,7 @@ mod test { }; let values = &[Some(foo1), 
None, Some(foo2), None]; - let ca = ObjectChunked::new("", values); + let ca = ObjectChunked::new(PlSmallStr::const_default(), values); let groups = GroupsProxy::Idx(vec![(0, unitvec![0, 1]), (2, unitvec![2]), (3, unitvec![3])].into()); @@ -241,7 +245,7 @@ mod test { }; let values = &[Some(foo1.clone()), None, Some(foo2.clone()), None]; - let ca = ObjectChunked::new("", values); + let ca = ObjectChunked::new(PlSmallStr::const_default(), values); let groups = vec![(0, unitvec![0, 1]), (2, unitvec![2]), (3, unitvec![3])].into(); let out = unsafe { ca.agg_list(&GroupsProxy::Idx(groups)) }; diff --git a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs index 6030f668dfe1..6302d9846769 100644 --- a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs +++ b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs @@ -13,7 +13,11 @@ impl PolarsExtension { let arr = arr.slice_typed_unchecked(i, 1); let pe = Self::new(arr); let pe = ManuallyDrop::new(pe); - pe.get_series("").get(0).unwrap().into_static().unwrap() + pe.get_series(&PlSmallStr::const_default()) + .get(0) + .unwrap() + .into_static() + .unwrap() } pub(crate) unsafe fn new(array: FixedSizeBinaryArray) -> Self { @@ -57,7 +61,7 @@ impl PolarsExtension { /// Calls the heap allocated function in the `[ExtensionSentinel]` that knows /// how to convert the `[FixedSizeBinaryArray]` to a `Series` of type `[ObjectChunked]` - pub(crate) unsafe fn get_series(&self, name: &str) -> Series { + pub(crate) unsafe fn get_series(&self, name: &PlSmallStr) -> Series { self.with_sentinel(|sent| { (sent.to_series_fn.as_ref().unwrap())(self.array.as_ref().unwrap(), name) }) @@ -66,7 +70,7 @@ impl PolarsExtension { // heap allocates a function that converts the binary array to a Series of `[ObjectChunked]` // the `name` will be the `name` of the output `Series` when this function is called (later). 
pub(crate) unsafe fn set_to_series_fn(&mut self) { - let f = Box::new(move |arr: &FixedSizeBinaryArray, name: &str| { + let f = Box::new(move |arr: &FixedSizeBinaryArray, name: &PlSmallStr| { let iter = arr.iter().map(|opt| { opt.map(|bytes| { let t = std::ptr::read_unaligned(bytes.as_ptr() as *const T); @@ -77,7 +81,7 @@ impl PolarsExtension { }) }); - let ca = ObjectChunked::::from_iter_options(name, iter); + let ca = ObjectChunked::::from_iter_options(name.clone(), iter); ca.into_series() }); self.with_sentinel(move |sent| { diff --git a/crates/polars-core/src/chunked_array/object/registry.rs b/crates/polars-core/src/chunked_array/object/registry.rs index ef5febddad76..e84c7ab69ba5 100644 --- a/crates/polars-core/src/chunked_array/object/registry.rs +++ b/crates/polars-core/src/chunked_array/object/registry.rs @@ -9,6 +9,7 @@ use std::sync::{Arc, RwLock}; use arrow::datatypes::ArrowDataType; use once_cell::sync::Lazy; +use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::object::builder::ObjectChunkedBuilder; use crate::datatypes::AnyValue; @@ -17,7 +18,7 @@ use crate::series::{IntoSeries, Series}; /// Takes a `name` and `capacity` and constructs a new builder. 
pub type BuilderConstructor = - Box Box + Send + Sync>; + Box Box + Send + Sync>; pub type ObjectConverter = Arc Box + Send + Sync>; pub struct ObjectRegistry { @@ -116,7 +117,7 @@ pub fn get_object_physical_type() -> ArrowDataType { reg.physical_dtype.clone() } -pub fn get_object_builder(name: &str, capacity: usize) -> Box { +pub fn get_object_builder(name: PlSmallStr, capacity: usize) -> Box { let reg = GLOBAL_OBJECT_REGISTRY.read().unwrap(); let reg = reg.as_ref().unwrap(); (reg.builder_constructor)(name, capacity) diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index b6848d80b652..803fe7a5ba78 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -636,9 +636,9 @@ mod test { // Validated with numpy. Note that numpy uses ddof as an argument which // influences results. The default ddof=0, we chose ddof=1, which is // standard in statistics. 
- let ca1 = Int32Chunked::new("", &[5, 8, 9, 5, 0]); + let ca1 = Int32Chunked::new(PlSmallStr::const_default(), &[5, 8, 9, 5, 0]); let ca2 = Int32Chunked::new( - "", + PlSmallStr::const_default(), &[ Some(5), None, @@ -660,11 +660,11 @@ mod test { #[test] fn test_agg_float() { - let ca1 = Float32Chunked::new("a", &[1.0, f32::NAN]); - let ca2 = Float32Chunked::new("b", &[f32::NAN, 1.0]); + let ca1 = Float32Chunked::new(PlSmallStr::from_static("a"), &[1.0, f32::NAN]); + let ca2 = Float32Chunked::new(PlSmallStr::from_static("b"), &[f32::NAN, 1.0]); assert_eq!(ca1.min(), ca2.min()); - let ca1 = Float64Chunked::new("a", &[1.0, f64::NAN]); - let ca2 = Float64Chunked::from_slice("b", &[f64::NAN, 1.0]); + let ca1 = Float64Chunked::new(PlSmallStr::from_static("a"), &[1.0, f64::NAN]); + let ca2 = Float64Chunked::from_slice(PlSmallStr::from_static("b"), &[f64::NAN, 1.0]); assert_eq!(ca1.min(), ca2.min()); println!("{:?}", (ca1.min(), ca2.min())) } @@ -672,12 +672,12 @@ mod test { #[test] fn test_median() { let ca = UInt32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[Some(2), Some(1), None, Some(3), Some(5), None, Some(4)], ); assert_eq!(ca.median(), Some(3.0)); let ca = UInt32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[ None, Some(7), @@ -694,7 +694,7 @@ mod test { assert_eq!(ca.median(), Some(4.0)); let ca = Float32Chunked::from_slice( - "", + PlSmallStr::const_default(), &[ 0.166189, 0.166559, 0.168517, 0.169393, 0.175272, 0.233167, 0.238787, 0.266562, 0.26903, 0.285792, 0.292801, 0.293429, 0.301706, 0.308534, 0.331489, 0.346095, @@ -707,7 +707,7 @@ mod test { #[test] fn test_mean() { - let ca = Float32Chunked::new("", &[Some(1.0), Some(2.0), None]); + let ca = Float32Chunked::new(PlSmallStr::const_default(), &[Some(1.0), Some(2.0), None]); assert_eq!(ca.mean().unwrap(), 1.5); assert_eq!( ca.into_series() @@ -718,7 +718,7 @@ mod test { 1.5 ); // all null values case - let ca = Float32Chunked::full_null("", 3); + let ca = 
Float32Chunked::full_null(PlSmallStr::const_default(), 3); assert_eq!(ca.mean(), None); assert_eq!( ca.into_series().mean_reduce().value().extract::(), @@ -728,10 +728,14 @@ mod test { #[test] fn test_quantile_all_null() { - let test_f32 = Float32Chunked::from_slice_options("", &[None, None, None]); - let test_i32 = Int32Chunked::from_slice_options("", &[None, None, None]); - let test_f64 = Float64Chunked::from_slice_options("", &[None, None, None]); - let test_i64 = Int64Chunked::from_slice_options("", &[None, None, None]); + let test_f32 = + Float32Chunked::from_slice_options(PlSmallStr::const_default(), &[None, None, None]); + let test_i32 = + Int32Chunked::from_slice_options(PlSmallStr::const_default(), &[None, None, None]); + let test_f64 = + Float64Chunked::from_slice_options(PlSmallStr::const_default(), &[None, None, None]); + let test_i64 = + Int64Chunked::from_slice_options(PlSmallStr::const_default(), &[None, None, None]); let interpol_options = vec![ QuantileInterpolOptions::Nearest, @@ -751,10 +755,12 @@ mod test { #[test] fn test_quantile_single_value() { - let test_f32 = Float32Chunked::from_slice_options("", &[Some(1.0)]); - let test_i32 = Int32Chunked::from_slice_options("", &[Some(1)]); - let test_f64 = Float64Chunked::from_slice_options("", &[Some(1.0)]); - let test_i64 = Int64Chunked::from_slice_options("", &[Some(1)]); + let test_f32 = + Float32Chunked::from_slice_options(PlSmallStr::const_default(), &[Some(1.0)]); + let test_i32 = Int32Chunked::from_slice_options(PlSmallStr::const_default(), &[Some(1)]); + let test_f64 = + Float64Chunked::from_slice_options(PlSmallStr::const_default(), &[Some(1.0)]); + let test_i64 = Int64Chunked::from_slice_options(PlSmallStr::const_default(), &[Some(1)]); let interpol_options = vec![ QuantileInterpolOptions::Nearest, @@ -774,14 +780,22 @@ mod test { #[test] fn test_quantile_min_max() { - let test_f32 = - Float32Chunked::from_slice_options("", &[None, Some(1f32), Some(5f32), Some(1f32)]); - let test_i32 = - 
Int32Chunked::from_slice_options("", &[None, Some(1i32), Some(5i32), Some(1i32)]); - let test_f64 = - Float64Chunked::from_slice_options("", &[None, Some(1f64), Some(5f64), Some(1f64)]); - let test_i64 = - Int64Chunked::from_slice_options("", &[None, Some(1i64), Some(5i64), Some(1i64)]); + let test_f32 = Float32Chunked::from_slice_options( + PlSmallStr::const_default(), + &[None, Some(1f32), Some(5f32), Some(1f32)], + ); + let test_i32 = Int32Chunked::from_slice_options( + PlSmallStr::const_default(), + &[None, Some(1i32), Some(5i32), Some(1i32)], + ); + let test_f64 = Float64Chunked::from_slice_options( + PlSmallStr::const_default(), + &[None, Some(1f64), Some(5f64), Some(1f64)], + ); + let test_i64 = Int64Chunked::from_slice_options( + PlSmallStr::const_default(), + &[None, Some(1i64), Some(5i64), Some(1i64)], + ); let interpol_options = vec![ QuantileInterpolOptions::Nearest, @@ -822,7 +836,7 @@ mod test { #[test] fn test_quantile() { let ca = UInt32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[Some(2), Some(1), None, Some(3), Some(5), None, Some(4)], ); @@ -896,7 +910,7 @@ mod test { ); let ca = UInt32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[ None, Some(7), diff --git a/crates/polars-core/src/chunked_array/ops/any_value.rs b/crates/polars-core/src/chunked_array/ops/any_value.rs index f9a959e582a2..9064b0a0aa7f 100644 --- a/crates/polars-core/src/chunked_array/ops/any_value.rs +++ b/crates/polars-core/src/chunked_array/ops/any_value.rs @@ -49,12 +49,20 @@ pub(crate) unsafe fn arr_to_any_value<'a>( DataType::List(dt) => { let v: ArrayRef = downcast!(LargeListArray); if dt.is_primitive() { - let s = Series::from_chunks_and_dtype_unchecked("", vec![v], dt); + let s = Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![v], + dt, + ); AnyValue::List(s) } else { - let s = Series::from_chunks_and_dtype_unchecked("", vec![v], &dt.to_physical()) - .cast_unchecked(dt) - .unwrap(); + let s = 
Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![v], + &dt.to_physical(), + ) + .cast_unchecked(dt) + .unwrap(); AnyValue::List(s) } }, @@ -62,12 +70,20 @@ pub(crate) unsafe fn arr_to_any_value<'a>( DataType::Array(dt, width) => { let v: ArrayRef = downcast!(FixedSizeListArray); if dt.is_primitive() { - let s = Series::from_chunks_and_dtype_unchecked("", vec![v], dt); + let s = Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![v], + dt, + ); AnyValue::Array(s, *width) } else { - let s = Series::from_chunks_and_dtype_unchecked("", vec![v], &dt.to_physical()) - .cast_unchecked(dt) - .unwrap(); + let s = Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![v], + &dt.to_physical(), + ) + .cast_unchecked(dt) + .unwrap(); AnyValue::Array(s, *width) } }, diff --git a/crates/polars-core/src/chunked_array/ops/append.rs b/crates/polars-core/src/chunked_array/ops/append.rs index 35b7a352c8aa..383c76d63600 100644 --- a/crates/polars-core/src/chunked_array/ops/append.rs +++ b/crates/polars-core/src/chunked_array/ops/append.rs @@ -155,7 +155,7 @@ where impl ListChunked { pub fn append(&mut self, other: &Self) -> PolarsResult<()> { let dtype = merge_dtypes(self.dtype(), other.dtype())?; - self.field = Arc::new(Field::new(self.name(), dtype)); + self.field = Arc::new(Field::new(self.name().clone(), dtype)); let len = self.len(); self.length = self @@ -177,7 +177,7 @@ impl ListChunked { impl ArrayChunked { pub fn append(&mut self, other: &Self) -> PolarsResult<()> { let dtype = merge_dtypes(self.dtype(), other.dtype())?; - self.field = Arc::new(Field::new(self.name(), dtype)); + self.field = Arc::new(Field::new(self.name().clone(), dtype)); let len = self.len(); @@ -198,7 +198,7 @@ impl ArrayChunked { impl StructChunked { pub fn append(&mut self, other: &Self) -> PolarsResult<()> { let dtype = merge_dtypes(self.dtype(), other.dtype())?; - self.field = Arc::new(Field::new(self.name(), dtype)); + 
self.field = Arc::new(Field::new(self.name().clone(), dtype)); let len = self.len(); diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index de62f1eddc7f..4e2d16687f99 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -37,7 +37,7 @@ where } }); - ChunkedArray::from_chunk_iter(self.name(), iter) + ChunkedArray::from_chunk_iter(self.name().clone(), iter) } /// Applies a function only to the non-null elements, propagating nulls. @@ -64,7 +64,7 @@ where Ok(arr) }); - ChunkedArray::try_from_chunk_iter(self.name(), iter) + ChunkedArray::try_from_chunk_iter(self.name().clone(), iter) } pub fn apply_into_string_amortized<'a, F>(&'a self, mut f: F) -> StringChunked @@ -87,7 +87,7 @@ where mutarr.freeze() }) .collect::>(); - ChunkedArray::from_chunk_iter(self.name(), chunks) + ChunkedArray::from_chunk_iter(self.name().clone(), chunks) } pub fn try_apply_into_string_amortized<'a, F, E>(&'a self, mut f: F) -> Result @@ -112,11 +112,11 @@ where Ok(mutarr.freeze()) }) .collect::>(); - ChunkedArray::try_from_chunk_iter(self.name(), chunks) + ChunkedArray::try_from_chunk_iter(self.name().clone(), chunks) } } -fn apply_in_place_impl(name: &str, chunks: Vec, f: F) -> ChunkedArray +fn apply_in_place_impl(name: PlSmallStr, chunks: Vec, f: F) -> ChunkedArray where F: Fn(S::Native) -> S::Native + Copy, S: PolarsNumericType, @@ -170,7 +170,7 @@ impl ChunkedArray { .unwrap(); s.chunks().clone() }; - apply_in_place_impl(self.name(), chunks, f) + apply_in_place_impl(self.name().clone(), chunks, f) } /// Cast a numeric array to another numeric data type and apply a function in place. 
@@ -180,7 +180,7 @@ impl ChunkedArray { F: Fn(T::Native) -> T::Native + Copy, { let chunks = std::mem::take(&mut self.chunks); - apply_in_place_impl(self.name(), chunks, f) + apply_in_place_impl(self.name().clone(), chunks, f) } } @@ -217,7 +217,7 @@ where let arr: T::Array = slice.iter().copied().map(f).collect_arr(); arr.with_validity(validity.cloned()) }); - ChunkedArray::from_chunk_iter(self.name(), chunks) + ChunkedArray::from_chunk_iter(self.name().clone(), chunks) } fn apply(&'a self, f: F) -> Self @@ -228,7 +228,7 @@ where let iter = arr.into_iter().map(|opt_v| f(opt_v.copied())); PrimitiveArray::::from_trusted_len_iter(iter) }); - Self::from_chunk_iter(self.name(), chunks) + Self::from_chunk_iter(self.name().clone(), chunks) } fn apply_to_slice(&'a self, f: F, slice: &mut [V]) @@ -312,7 +312,7 @@ impl StringChunked { let new = Utf8ViewArray::arr_from_iter(iter); new.with_validity(arr.validity().cloned()) }); - StringChunked::from_chunk_iter(self.name(), chunks) + StringChunked::from_chunk_iter(self.name().clone(), chunks) } } @@ -326,7 +326,7 @@ impl BinaryChunked { let new = BinaryViewArray::arr_from_iter(iter); new.with_validity(arr.validity().cloned()) }); - BinaryChunked::from_chunk_iter(self.name(), chunks) + BinaryChunked::from_chunk_iter(self.name().clone(), chunks) } } @@ -405,7 +405,7 @@ impl<'a> ChunkApply<'a, &'a [u8]> for BinaryChunked { impl ChunkApplyKernel for BooleanChunked { fn apply_kernel(&self, f: &dyn Fn(&BooleanArray) -> ArrayRef) -> Self { let chunks = self.downcast_iter().map(f).collect(); - unsafe { Self::from_chunks(self.name(), chunks) } + unsafe { Self::from_chunks(self.name().clone(), chunks) } } fn apply_kernel_cast(&self, f: &dyn Fn(&BooleanArray) -> ArrayRef) -> ChunkedArray @@ -413,7 +413,7 @@ impl ChunkApplyKernel for BooleanChunked { S: PolarsDataType, { let chunks = self.downcast_iter().map(f).collect(); - unsafe { ChunkedArray::::from_chunks(self.name(), chunks) } + unsafe { 
ChunkedArray::::from_chunks(self.name().clone(), chunks) } } } @@ -432,7 +432,7 @@ where S: PolarsDataType, { let chunks = self.downcast_iter().map(f).collect(); - unsafe { ChunkedArray::from_chunks(self.name(), chunks) } + unsafe { ChunkedArray::from_chunks(self.name().clone(), chunks) } } } @@ -446,7 +446,7 @@ impl ChunkApplyKernel for StringChunked { S: PolarsDataType, { let chunks = self.downcast_iter().map(f).collect(); - unsafe { ChunkedArray::from_chunks(self.name(), chunks) } + unsafe { ChunkedArray::from_chunks(self.name().clone(), chunks) } } } @@ -460,7 +460,7 @@ impl ChunkApplyKernel for BinaryChunked { S: PolarsDataType, { let chunks = self.downcast_iter().map(f).collect(); - unsafe { ChunkedArray::from_chunks(self.name(), chunks) } + unsafe { ChunkedArray::from_chunks(self.name().clone(), chunks) } } } @@ -519,7 +519,9 @@ impl<'a> ChunkApply<'a, Series> for ListChunked { let mut idx = 0; self.downcast_iter().for_each(|arr| { arr.iter().for_each(|opt_val| { - let opt_val = opt_val.map(|arrayref| Series::try_from(("", arrayref)).unwrap()); + let opt_val = opt_val.map(|arrayref| { + Series::try_from((PlSmallStr::const_default(), arrayref)).unwrap() + }); // SAFETY: // length asserted above @@ -543,7 +545,7 @@ where F: Fn(&'a T) -> T + Copy, { let mut ca: ObjectChunked = self.into_iter().map(|opt_v| opt_v.map(f)).collect(); - ca.rename(self.name()); + ca.rename(self.name().clone()); ca } @@ -552,7 +554,7 @@ where F: Fn(Option<&'a T>) -> Option + Copy, { let mut ca: ObjectChunked = self.into_iter().map(f).collect(); - ca.rename(self.name()); + ca.rename(self.name().clone()); ca } diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index c69ae14a5866..774b6fba6755 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -3,6 +3,7 @@ use std::error::Error; use arrow::array::{Array, MutablePlString, StaticArray}; use 
arrow::compute::utils::combine_validities_and; use polars_error::PolarsResult; +use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::metadata::MetadataProperties; use crate::datatypes::{ArrayCollectIterExt, ArrayFromIter}; @@ -49,7 +50,7 @@ where F: FnMut(&T::Array) -> Arr, { let iter = ca.downcast_iter().map(op); - ChunkedArray::from_chunk_iter(ca.name(), iter) + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } /// Applies a kernel that produces `Array` types. @@ -61,9 +62,9 @@ where Arr: Array, F: FnMut(T::Array) -> Arr, { - let name = ca.name().to_owned(); + let name = ca.name().clone(); let iter = ca.downcast_into_iter().map(op); - ChunkedArray::from_chunk_iter(&name, iter) + ChunkedArray::from_chunk_iter(name, iter) } #[inline] @@ -78,12 +79,12 @@ where let iter = ca .downcast_iter() .map(|arr| arr.iter().map(&mut op).collect_arr()); - ChunkedArray::from_chunk_iter(ca.name(), iter) + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } else { let iter = ca .downcast_iter() .map(|arr| arr.values_iter().map(|x| op(Some(x))).collect_arr()); - ChunkedArray::from_chunk_iter(ca.name(), iter) + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } } @@ -101,7 +102,7 @@ where let iter = ca .downcast_iter() .map(|arr| arr.iter().map(&mut op).try_collect_arr()); - ChunkedArray::try_from_chunk_iter(ca.name(), iter) + ChunkedArray::try_from_chunk_iter(ca.name().clone(), iter) } #[inline] @@ -114,7 +115,7 @@ where { if ca.null_count() == ca.len() { let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(CompatLevel::newest())); - return ChunkedArray::with_chunk(ca.name(), arr); + return ChunkedArray::with_chunk(ca.name().clone(), arr); } let iter = ca.downcast_iter().map(|arr| { @@ -122,7 +123,7 @@ where let arr: V::Array = arr.values_iter().map(&mut op).collect_arr(); arr.with_validity_typed(validity) }); - ChunkedArray::from_chunk_iter(ca.name(), iter) + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } #[inline] @@ -138,7 +139,7 
@@ where { if ca.null_count() == ca.len() { let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(CompatLevel::newest())); - return Ok(ChunkedArray::with_chunk(ca.name(), arr)); + return Ok(ChunkedArray::with_chunk(ca.name().clone(), arr)); } let iter = ca.downcast_iter().map(|arr| { @@ -146,7 +147,7 @@ where let arr: V::Array = arr.values_iter().map(&mut op).try_collect_arr()?; Ok(arr.with_validity_typed(validity)) }); - ChunkedArray::try_from_chunk_iter(ca.name(), iter) + ChunkedArray::try_from_chunk_iter(ca.name().clone(), iter) } /// Applies a kernel that produces `Array` types. @@ -164,7 +165,7 @@ where let iter = ca .downcast_iter() .map(|arr| op(arr).with_validity_typed(arr.validity().cloned())); - ChunkedArray::from_chunk_iter(ca.name(), iter) + ChunkedArray::from_chunk_iter(ca.name().clone(), iter) } /// Applies a kernel that produces `Array` types. @@ -176,7 +177,7 @@ where Arr: Array + StaticArray, F: FnMut(&T::Array) -> Arr, { - ChunkedArray::from_chunk_iter(ca.name(), ca.downcast_iter().map(op)) + ChunkedArray::from_chunk_iter(ca.name().clone(), ca.downcast_iter().map(op)) } #[inline] @@ -191,7 +192,7 @@ where F: FnMut(&T::Array) -> Result, E: Error, { - ChunkedArray::try_from_chunk_iter(ca.name(), ca.downcast_iter().map(op)) + ChunkedArray::try_from_chunk_iter(ca.name().clone(), ca.downcast_iter().map(op)) } #[inline] @@ -220,7 +221,7 @@ where .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); element_iter.collect_arr() }); - ChunkedArray::from_chunk_iter(lhs.name(), iter) + ChunkedArray::from_chunk_iter(lhs.name().clone(), iter) } #[inline] @@ -297,7 +298,7 @@ where .map(|(lhs_opt_val, rhs_opt_val)| op(lhs_opt_val, rhs_opt_val)); element_iter.try_collect_arr() }); - ChunkedArray::try_from_chunk_iter(lhs.name(), iter) + ChunkedArray::try_from_chunk_iter(lhs.name().clone(), iter) } #[inline] @@ -317,7 +318,7 @@ where let len = lhs.len().min(rhs.len()); let arr = V::Array::full_null(len, 
V::get_dtype().to_arrow(CompatLevel::newest())); - return ChunkedArray::with_chunk(lhs.name(), arr); + return ChunkedArray::with_chunk(lhs.name().clone(), arr); } let (lhs, rhs) = align_chunks_binary(lhs, rhs); @@ -336,7 +337,7 @@ where let array: V::Array = element_iter.collect_arr(); array.with_validity_typed(validity) }); - ChunkedArray::from_chunk_iter(lhs.name(), iter) + ChunkedArray::from_chunk_iter(lhs.name().clone(), iter) } /// Apply elementwise binary function which produces string, amortising allocations. @@ -373,7 +374,7 @@ where }); mutarr.freeze() }); - ChunkedArray::from_chunk_iter(lhs.name(), iter) + ChunkedArray::from_chunk_iter(lhs.name().clone(), iter) } /// Applies a kernel that produces `Array` types. @@ -385,7 +386,7 @@ pub fn binary_mut_values( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, - name: &str, + name: PlSmallStr, ) -> ChunkedArray where T: PolarsDataType, @@ -413,7 +414,7 @@ pub fn binary_mut_with_options( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, - name: &str, + name: PlSmallStr, ) -> ChunkedArray where T: PolarsDataType, @@ -435,7 +436,7 @@ pub fn try_binary_mut_with_options( lhs: &ChunkedArray, rhs: &ChunkedArray, mut op: F, - name: &str, + name: PlSmallStr, ) -> Result, E> where T: PolarsDataType, @@ -466,7 +467,7 @@ where Arr: Array, F: FnMut(&T::Array, &U::Array) -> Arr, { - binary_mut_with_options(lhs, rhs, op, lhs.name()) + binary_mut_with_options(lhs, rhs, op, lhs.name().clone()) } /// Applies a kernel that produces `Array` types. @@ -482,13 +483,13 @@ where Arr: Array, F: FnMut(L::Array, R::Array) -> Arr, { - let name = lhs.name().to_owned(); + let name = lhs.name().clone(); let (lhs, rhs) = align_chunks_binary_owned(lhs, rhs); let iter = lhs .downcast_into_iter() .zip(rhs.downcast_into_iter()) .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); - ChunkedArray::from_chunk_iter(&name, iter) + ChunkedArray::from_chunk_iter(name, iter) } /// Applies a kernel that produces `Array` types. 
@@ -510,7 +511,7 @@ where .downcast_iter() .zip(rhs.downcast_iter()) .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)); - ChunkedArray::try_from_chunk_iter(lhs.name(), iter) + ChunkedArray::try_from_chunk_iter(lhs.name().clone(), iter) } /// Applies a kernel that produces `ArrayRef` of the same type. @@ -566,7 +567,7 @@ where .zip(rhs.downcast_iter()) .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr)) .collect::>(); - Series::try_from((lhs.name(), chunks)) + Series::try_from((lhs.name().clone(), chunks)) } /// Applies a kernel that produces `ArrayRef` of the same type. @@ -636,7 +637,7 @@ where ); element_iter.try_collect_arr() }); - ChunkedArray::try_from_chunk_iter(ca1.name(), iter) + ChunkedArray::try_from_chunk_iter(ca1.name().clone(), iter) } #[inline] @@ -677,7 +678,7 @@ where ); element_iter.collect_arr() }); - ChunkedArray::from_chunk_iter(ca1.name(), iter) + ChunkedArray::from_chunk_iter(ca1.name().clone(), iter) } pub fn broadcast_binary_elementwise( @@ -697,7 +698,7 @@ where match (lhs.len(), rhs.len()) { (1, _) => { let a = unsafe { lhs.get_unchecked(0) }; - unary_elementwise(rhs, |b| op(a.clone(), b)).with_name(lhs.name()) + unary_elementwise(rhs, |b| op(a.clone(), b)).with_name(lhs.name().clone()) }, (_, 1) => { let b = unsafe { rhs.get_unchecked(0) }; @@ -722,7 +723,7 @@ where match (lhs.len(), rhs.len()) { (1, _) => { let a = unsafe { lhs.get_unchecked(0) }; - Ok(try_unary_elementwise(rhs, |b| op(a.clone(), b))?.with_name(lhs.name())) + Ok(try_unary_elementwise(rhs, |b| op(a.clone(), b))?.with_name(lhs.name().clone())) }, (_, 1) => { let b = unsafe { rhs.get_unchecked(0) }; @@ -750,13 +751,13 @@ where let len = if min == 1 { max } else { min }; let arr = V::Array::full_null(len, V::get_dtype().to_arrow(CompatLevel::newest())); - return ChunkedArray::with_chunk(lhs.name(), arr); + return ChunkedArray::with_chunk(lhs.name().clone(), arr); } match (lhs.len(), rhs.len()) { (1, _) => { let a = unsafe { lhs.value_unchecked(0) }; - unary_elementwise_values(rhs, 
|b| op(a.clone(), b)).with_name(lhs.name()) + unary_elementwise_values(rhs, |b| op(a.clone(), b)).with_name(lhs.name().clone()) }, (_, 1) => { let b = unsafe { rhs.value_unchecked(0) }; @@ -793,7 +794,7 @@ where lhs.len(), O::get_dtype().to_arrow(CompatLevel::newest()), ); - ChunkedArray::::with_chunk(lhs.name(), arr) + ChunkedArray::::with_chunk(lhs.name().clone(), arr) }, Some(rhs) => unary_kernel(lhs, |arr| rhs_broadcast_kernel(arr, rhs.clone())), } @@ -806,14 +807,14 @@ where rhs.len(), O::get_dtype().to_arrow(CompatLevel::newest()), ); - ChunkedArray::::with_chunk(lhs.name(), arr) + ChunkedArray::::with_chunk(lhs.name().clone(), arr) }, Some(lhs) => unary_kernel(rhs, |arr| lhs_broadcast_kernel(lhs.clone(), arr)), } }, _ => panic!("Cannot apply operation on arrays of different lengths"), }; - out.with_name(name) + out.with_name(name.clone()) } pub fn apply_binary_kernel_broadcast_owned( @@ -843,7 +844,7 @@ where lhs.len(), O::get_dtype().to_arrow(CompatLevel::newest()), ); - ChunkedArray::::with_chunk(lhs.name(), arr) + ChunkedArray::::with_chunk(lhs.name().clone(), arr) }, Some(rhs) => unary_kernel_owned(lhs, |arr| rhs_broadcast_kernel(arr, rhs.clone())), } @@ -856,12 +857,12 @@ where rhs.len(), O::get_dtype().to_arrow(CompatLevel::newest()), ); - ChunkedArray::::with_chunk(lhs.name(), arr) + ChunkedArray::::with_chunk(lhs.name().clone(), arr) }, Some(lhs) => unary_kernel_owned(rhs, |arr| lhs_broadcast_kernel(lhs.clone(), arr)), } }, _ => panic!("Cannot apply operation on arrays of different lengths"), }; - out.with_name(&name) + out.with_name(name) } diff --git a/crates/polars-core/src/chunked_array/ops/bit_repr.rs b/crates/polars-core/src/chunked_array/ops/bit_repr.rs index 9a2f1c33594a..7b20d77e2444 100644 --- a/crates/polars-core/src/chunked_array/ops/bit_repr.rs +++ b/crates/polars-core/src/chunked_array/ops/bit_repr.rs @@ -20,7 +20,7 @@ fn reinterpret_chunked_array( PrimitiveArray::from_data_default(reinterpreted_buf, array.validity().cloned()) }); - 
ChunkedArray::from_chunk_iter(ca.name(), chunks) + ChunkedArray::from_chunk_iter(ca.name().clone(), chunks) } /// Reinterprets the type of a [`ListChunked`]. T and U must have the same size @@ -53,7 +53,7 @@ fn reinterpret_list_chunked( ) }); - ListChunked::from_chunk_iter(ca.name(), chunks) + ListChunked::from_chunk_iter(ca.name().clone(), chunks) } #[cfg(all(feature = "reinterpret", feature = "dtype-i16", feature = "dtype-u16"))] diff --git a/crates/polars-core/src/chunked_array/ops/chunkops.rs b/crates/polars-core/src/chunked_array/ops/chunkops.rs index d97af95367e6..aad1157a25d9 100644 --- a/crates/polars-core/src/chunked_array/ops/chunkops.rs +++ b/crates/polars-core/src/chunked_array/ops/chunkops.rs @@ -363,7 +363,7 @@ impl ObjectChunked { if self.chunks.len() == 1 { self.clone() } else { - let mut builder = ObjectChunkedBuilder::new(self.name(), self.len()); + let mut builder = ObjectChunkedBuilder::new(self.name().clone(), self.len()); let chunks = self.downcast_iter(); // todo! 
use iterators once implemented @@ -398,7 +398,7 @@ mod test { #[test] #[cfg(feature = "dtype-categorical")] fn test_categorical_map_after_rechunk() { - let s = Series::new("", &["foo", "bar", "spam"]); + let s = Series::new(PlSmallStr::const_default(), &["foo", "bar", "spam"]); let mut a = s .cast(&DataType::Categorical(None, Default::default())) .unwrap(); diff --git a/crates/polars-core/src/chunked_array/ops/decimal.rs b/crates/polars-core/src/chunked_array/ops/decimal.rs index e2f9c5845429..5f242ee37caa 100644 --- a/crates/polars-core/src/chunked_array/ops/decimal.rs +++ b/crates/polars-core/src/chunked_array/ops/decimal.rs @@ -43,7 +43,7 @@ mod test { "5.104", "5.25251525353", ]; - let s = StringChunked::from_slice("test", &vals); + let s = StringChunked::from_slice(PlSmallStr::from_str("test"), &vals); let s = s.to_decimal(6).unwrap(); assert_eq!(s.dtype(), &DataType::Decimal(None, Some(5))); assert_eq!(s.len(), 7); diff --git a/crates/polars-core/src/chunked_array/ops/explode.rs b/crates/polars-core/src/chunked_array/ops/explode.rs index b44ee0863a98..11af60734f71 100644 --- a/crates/polars-core/src/chunked_array/ops/explode.rs +++ b/crates/polars-core/src/chunked_array/ops/explode.rs @@ -1,16 +1,9 @@ use arrow::array::*; use arrow::bitmap::utils::set_bit_unchecked; use arrow::bitmap::{Bitmap, MutableBitmap}; -use arrow::legacy::array::list::AnonymousBuilder; -#[cfg(feature = "dtype-array")] -use arrow::legacy::is_valid::IsValid; use arrow::legacy::prelude::*; -use arrow::legacy::trusted_len::TrustedLenPush; use polars_utils::slice::GetSaferUnchecked; -#[cfg(feature = "dtype-array")] -use crate::chunked_array::builder::get_fixed_size_list_builder; -use crate::chunked_array::metadata::MetadataProperties; use crate::prelude::*; use crate::series::implementations::null::NullChunked; @@ -154,18 +147,24 @@ where new_values.into(), Some(validity.into()), ); - Series::try_from((self.name(), Box::new(arr) as ArrayRef)).unwrap() + 
Series::try_from((self.name().clone(), Box::new(arr) as ArrayRef)).unwrap() } } impl ExplodeByOffsets for Float32Chunked { fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.apply_as_ints(|s| s.explode_by_offsets(offsets)) + self.apply_as_ints(|s| { + let ca = s.u32().unwrap(); + ca.explode_by_offsets(offsets) + }) } } impl ExplodeByOffsets for Float64Chunked { fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.apply_as_ints(|s| s.explode_by_offsets(offsets)) + self.apply_as_ints(|s| { + let ca = s.u64().unwrap(); + ca.explode_by_offsets(offsets) + }) } } @@ -190,7 +189,7 @@ impl ExplodeByOffsets for BooleanChunked { let arr = self.downcast_iter().next().unwrap(); let cap = get_capacity(offsets); - let mut builder = BooleanChunkedBuilder::new(self.name(), cap); + let mut builder = BooleanChunkedBuilder::new(self.name().clone(), cap); let mut start = offsets[0] as usize; let mut last = start; @@ -225,166 +224,6 @@ impl ExplodeByOffsets for BooleanChunked { } } -impl ExplodeByOffsets for ListChunked { - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - debug_assert_eq!(self.chunks.len(), 1); - let arr = self.downcast_iter().next().unwrap(); - - let cap = get_capacity(offsets); - let inner_type = self.inner_dtype(); - - let mut builder = arrow::legacy::array::list::AnonymousBuilder::new(cap); - let mut owned = Vec::with_capacity(cap); - let mut start = offsets[0] as usize; - let mut last = start; - - let mut process_range = |start: usize, last: usize, builder: &mut AnonymousBuilder<'_>| { - let vals = arr.slice_typed(start, last - start); - for opt_arr in vals.into_iter() { - match opt_arr { - None => builder.push_null(), - Some(arr) => { - unsafe { - // we create a pointer to evade the bck - let ptr = arr.as_ref() as *const dyn Array; - // SAFETY: we preallocated - owned.push_unchecked(arr); - // SAFETY: the pointer is still valid as `owned` will not reallocate - builder.push(&*ptr as &dyn Array); - } - }, - } - } - }; - - for &o 
in &offsets[1..] { - let o = o as usize; - if o == last { - if start != last { - process_range(start, last, &mut builder); - } - builder.push_null(); - start = o; - } - last = o; - } - process_range(start, last, &mut builder); - let arr = builder - .finish(Some(&inner_type.to_arrow(CompatLevel::newest()))) - .unwrap(); - let mut ca = unsafe { self.copy_with_chunks(vec![Box::new(arr)]) }; - - use MetadataProperties as P; - ca.copy_metadata(self, P::SORTED | P::FAST_EXPLODE_LIST); - - ca.into_series() - } -} - -#[cfg(feature = "dtype-array")] -impl ExplodeByOffsets for ArrayChunked { - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - debug_assert_eq!(self.chunks.len(), 1); - let arr = self.downcast_iter().next().unwrap(); - - let cap = get_capacity(offsets); - let inner_type = self.inner_dtype(); - let mut builder = - get_fixed_size_list_builder(inner_type, cap, self.width(), self.name()).unwrap(); - - let mut start = offsets[0] as usize; - let mut last = start; - for &o in &offsets[1..] 
{ - let o = o as usize; - if o == last { - if start != last { - let array = arr.slice_typed(start, last - start); - let values = array.values().as_ref(); - - for i in 0..array.len() { - unsafe { - if array.is_valid_unchecked(i) { - builder.push_unchecked(values, i) - } else { - builder.push_null() - } - } - } - } - unsafe { - builder.push_null(); - } - start = o; - } - last = o; - } - let array = arr.slice_typed(start, last - start); - let values = array.values().as_ref(); - for i in 0..array.len() { - unsafe { - if array.is_valid_unchecked(i) { - builder.push_unchecked(values, i) - } else { - builder.push_null() - } - } - } - - builder.finish().into() - } -} - -impl ExplodeByOffsets for StringChunked { - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - unsafe { - self.as_binary() - .explode_by_offsets(offsets) - .cast_unchecked(&DataType::String) - .unwrap() - } - } -} - -impl ExplodeByOffsets for BinaryChunked { - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - debug_assert_eq!(self.chunks.len(), 1); - let arr = self.downcast_iter().next().unwrap(); - - let cap = get_capacity(offsets); - let mut builder = BinaryChunkedBuilder::new(self.name(), cap); - - let mut start = offsets[0] as usize; - let mut last = start; - for &o in &offsets[1..] 
{ - let o = o as usize; - if o == last { - if start != last { - let vals = arr.slice_typed(start, last - start); - if vals.null_count() == 0 { - builder - .chunk_builder - .extend_trusted_len_values(vals.values_iter()) - } else { - builder.chunk_builder.extend_trusted_len(vals.into_iter()); - } - } - builder.append_null(); - start = o; - } - last = o; - } - let vals = arr.slice_typed(start, last - start); - if vals.null_count() == 0 { - builder - .chunk_builder - .extend_trusted_len_values(vals.values_iter()) - } else { - builder.chunk_builder.extend_trusted_len(vals.into_iter()); - } - builder.finish().into() - } -} - /// Convert Arrow array offsets to indexes of the original list pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec { if offsets.is_empty() { @@ -430,13 +269,17 @@ mod test { #[test] fn test_explode_list() -> PolarsResult<()> { - let mut builder = get_list_builder(&DataType::Int32, 5, 5, "a")?; + let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"))?; builder - .append_series(&Series::new("", &[1, 2, 3, 3])) + .append_series(&Series::new(PlSmallStr::const_default(), &[1, 2, 3, 3])) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[1])) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[2])) .unwrap(); - builder.append_series(&Series::new("", &[1])).unwrap(); - builder.append_series(&Series::new("", &[2])).unwrap(); let ca = builder.finish(); assert!(ca._can_fast_explode()); @@ -454,41 +297,21 @@ mod test { Ok(()) } - #[test] - fn test_explode_list_nulls() -> PolarsResult<()> { - let ca = Int32Chunked::from_slice_options("", &[None, Some(1), Some(2)]); - let offsets = &[0, 3, 3]; - let out = ca.explode_by_offsets(offsets); - assert_eq!( - Vec::from(out.i32().unwrap()), - &[None, Some(1), Some(2), None] - ); - - let ca = BooleanChunked::from_slice_options("", &[None, Some(true), Some(false)]); - let out = 
ca.explode_by_offsets(offsets); - assert_eq!( - Vec::from(out.bool().unwrap()), - &[None, Some(true), Some(false), None] - ); - - let ca = StringChunked::from_slice_options("", &[None, Some("b"), Some("c")]); - let out = ca.explode_by_offsets(offsets); - assert_eq!( - Vec::from(out.str().unwrap()), - &[None, Some("b"), Some("c"), None] - ); - Ok(()) - } - #[test] fn test_explode_empty_list_slot() -> PolarsResult<()> { // primitive - let mut builder = get_list_builder(&DataType::Int32, 5, 5, "a")?; - builder.append_series(&Series::new("", &[1i32, 2])).unwrap(); + let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"))?; + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[1i32, 2])) + .unwrap(); + builder + .append_series( + &Int32Chunked::from_slice(PlSmallStr::const_default(), &[]).into_series(), + ) + .unwrap(); builder - .append_series(&Int32Chunked::from_slice("", &[]).into_series()) + .append_series(&Series::new(PlSmallStr::const_default(), &[3i32])) .unwrap(); - builder.append_series(&Series::new("", &[3i32])).unwrap(); let ca = builder.finish(); let exploded = ca.explode()?; @@ -498,16 +321,26 @@ mod test { ); // more primitive - let mut builder = get_list_builder(&DataType::Int32, 5, 5, "a")?; - builder.append_series(&Series::new("", &[1i32])).unwrap(); + let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"))?; builder - .append_series(&Int32Chunked::from_slice("", &[]).into_series()) + .append_series(&Series::new(PlSmallStr::const_default(), &[1i32])) .unwrap(); - builder.append_series(&Series::new("", &[2i32])).unwrap(); builder - .append_series(&Int32Chunked::from_slice("", &[]).into_series()) + .append_series( + &Int32Chunked::from_slice(PlSmallStr::const_default(), &[]).into_series(), + ) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[2i32])) + .unwrap(); + builder + .append_series( + 
&Int32Chunked::from_slice(PlSmallStr::const_default(), &[]).into_series(), + ) + .unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[3, 4i32])) .unwrap(); - builder.append_series(&Series::new("", &[3, 4i32])).unwrap(); let ca = builder.finish(); let exploded = ca.explode()?; @@ -517,26 +350,41 @@ mod test { ); // string - let mut builder = get_list_builder(&DataType::String, 5, 5, "a")?; - builder.append_series(&Series::new("", &["abc"])).unwrap(); + let mut builder = get_list_builder(&DataType::String, 5, 5, PlSmallStr::from_static("a"))?; + builder + .append_series(&Series::new(PlSmallStr::const_default(), &["abc"])) + .unwrap(); builder .append_series( - &>::from_slice("", &[]) - .into_series(), + &>::from_slice( + PlSmallStr::const_default(), + &[], + ) + .into_series(), ) .unwrap(); - builder.append_series(&Series::new("", &["de"])).unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &["de"])) + .unwrap(); builder .append_series( - &>::from_slice("", &[]) - .into_series(), + &>::from_slice( + PlSmallStr::const_default(), + &[], + ) + .into_series(), ) .unwrap(); - builder.append_series(&Series::new("", &["fg"])).unwrap(); + builder + .append_series(&Series::new(PlSmallStr::const_default(), &["fg"])) + .unwrap(); builder .append_series( - &>::from_slice("", &[]) - .into_series(), + &>::from_slice( + PlSmallStr::const_default(), + &[], + ) + .into_series(), ) .unwrap(); @@ -548,17 +396,25 @@ mod test { ); // boolean - let mut builder = get_list_builder(&DataType::Boolean, 5, 5, "a")?; - builder.append_series(&Series::new("", &[true])).unwrap(); + let mut builder = get_list_builder(&DataType::Boolean, 5, 5, PlSmallStr::from_static("a"))?; + builder + .append_series(&Series::new(PlSmallStr::const_default(), &[true])) + .unwrap(); builder - .append_series(&BooleanChunked::from_slice("", &[]).into_series()) + .append_series( + &BooleanChunked::from_slice(PlSmallStr::const_default(), &[]).into_series(), + ) 
.unwrap(); - builder.append_series(&Series::new("", &[false])).unwrap(); builder - .append_series(&BooleanChunked::from_slice("", &[]).into_series()) + .append_series(&Series::new(PlSmallStr::const_default(), &[false])) + .unwrap(); + builder + .append_series( + &BooleanChunked::from_slice(PlSmallStr::const_default(), &[]).into_series(), + ) .unwrap(); builder - .append_series(&Series::new("", &[true, true])) + .append_series(&Series::new(PlSmallStr::const_default(), &[true, true])) .unwrap(); let ca = builder.finish(); diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index f335e3074665..8b1b87cbdaf8 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -3,6 +3,51 @@ use arrow::offset::OffsetsBuffer; use super::*; +impl ListChunked { + fn specialized( + &self, + values: ArrayRef, + offsets: &[i64], + offsets_buf: OffsetsBuffer, + ) -> (Series, OffsetsBuffer) { + // SAFETY: inner_dtype should be correct + let values = unsafe { + Series::from_chunks_and_dtype_unchecked( + self.name().clone(), + vec![values], + &self.inner_dtype().to_physical(), + ) + }; + + use crate::chunked_array::ops::explode::ExplodeByOffsets; + + let mut values = match values.dtype() { + DataType::Boolean => { + let t = values.bool().unwrap(); + ExplodeByOffsets::explode_by_offsets(t, offsets).into_series() + }, + DataType::Null => { + let t = values.null().unwrap(); + ExplodeByOffsets::explode_by_offsets(t, offsets).into_series() + }, + dtype => { + with_match_physical_numeric_polars_type!(dtype, |$T| { + let t: &ChunkedArray<$T> = values.as_ref().as_ref(); + ExplodeByOffsets::explode_by_offsets(t, offsets).into_series() + }) + }, + }; + + // let mut values = values.explode_by_offsets(offsets); + // restore logical type + unsafe { + values = values.cast_unchecked(self.inner_dtype()).unwrap(); + } + + 
(values, offsets_buf) + } +} + impl ChunkExplode for ListChunked { fn offsets(&self) -> PolarsResult> { let ca = self.rechunk(); @@ -40,7 +85,7 @@ impl ChunkExplode for ListChunked { ( unsafe { Series::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![values], &self.inner_dtype().to_physical(), ) @@ -64,16 +109,36 @@ impl ChunkExplode for ListChunked { panic!("could have fast exploded") } } - if listarr.null_count() == 0 { - // SAFETY: inner_dtype should be correct - let values = unsafe { - Series::from_chunks_and_dtype_unchecked( - self.name(), - vec![values], - &self.inner_dtype().to_physical(), - ) - }; - (values.explode_by_offsets(offsets), offsets_buf) + let (indices, new_offsets) = if listarr.null_count() == 0 { + // SPECIALIZED path. + let inner_phys = self.inner_dtype().to_physical(); + if inner_phys.is_numeric() || inner_phys.is_null() || inner_phys.is_bool() { + return Ok(self.specialized(values, offsets, offsets_buf)); + } + // Use gather + let mut indices = + MutablePrimitiveArray::::with_capacity(*offsets_buf.last() as usize); + let mut new_offsets = Vec::with_capacity(listarr.len() + 1); + let mut current_offset = 0i64; + let mut iter = offsets.iter(); + if let Some(mut previous) = iter.next().copied() { + new_offsets.push(current_offset); + iter.for_each(|&offset| { + let len = offset - previous; + let start = previous as IdxSize; + let end = offset as IdxSize; + + if len == 0 { + indices.push_null(); + } else { + indices.extend_trusted_len_values(start..end); + } + current_offset += len; + previous = offset; + new_offsets.push(current_offset); + }) + } + (indices, new_offsets) } else { // we have already ensure that validity is not none. 
let validity = listarr.validity().unwrap(); @@ -105,20 +170,22 @@ impl ChunkExplode for ListChunked { new_offsets.push(current_offset); }) } - // SAFETY: the indices we generate are in bounds - let chunk = unsafe { take_unchecked(values.as_ref(), &indices.into()) }; - // SAFETY: inner_dtype should be correct - let s = unsafe { - Series::from_chunks_and_dtype_unchecked( - self.name(), - vec![chunk], - &self.inner_dtype().to_physical(), - ) - }; - // SAFETY: monotonically increasing - let new_offsets = unsafe { OffsetsBuffer::new_unchecked(new_offsets.into()) }; - (s, new_offsets) - } + (indices, new_offsets) + }; + + // SAFETY: the indices we generate are in bounds + let chunk = unsafe { take_unchecked(values.as_ref(), &indices.into()) }; + // SAFETY: inner_dtype should be correct + let s = unsafe { + Series::from_chunks_and_dtype_unchecked( + self.name().clone(), + vec![chunk], + &self.inner_dtype().to_physical(), + ) + }; + // SAFETY: monotonically increasing + let new_offsets = unsafe { OffsetsBuffer::new_unchecked(new_offsets.into()) }; + (s, new_offsets) }; debug_assert_eq!(s.name(), self.name()); // restore logical type @@ -177,7 +244,7 @@ impl ChunkExplode for ArrayChunked { let arr = ca.downcast_iter().next().unwrap(); // fast-path for non-null array. 
if arr.null_count() == 0 { - let s = Series::try_from((self.name(), arr.values().clone())) + let s = Series::try_from((self.name().clone(), arr.values().clone())) .unwrap() .cast(ca.inner_dtype())?; let width = self.width() as i64; @@ -224,7 +291,11 @@ impl ChunkExplode for ArrayChunked { Ok(( // SAFETY: inner_dtype should be correct unsafe { - Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], ca.inner_dtype()) + Series::from_chunks_and_dtype_unchecked( + ca.name().clone(), + vec![chunk], + ca.inner_dtype(), + ) }, offsets, )) diff --git a/crates/polars-core/src/chunked_array/ops/extend.rs b/crates/polars-core/src/chunked_array/ops/extend.rs index db8c8923302d..9489c425d3ff 100644 --- a/crates/polars-core/src/chunked_array/ops/extend.rs +++ b/crates/polars-core/src/chunked_array/ops/extend.rs @@ -197,9 +197,9 @@ mod test { let mut values = Vec::with_capacity(32); values.extend_from_slice(&[1, 2, 3]); - let mut ca = Int32Chunked::from_vec("a", values); + let mut ca = Int32Chunked::from_vec(PlSmallStr::from_static("a"), values); let location = ca.cont_slice().unwrap().as_ptr() as usize; - let to_append = Int32Chunked::new("a", &[4, 5, 6]); + let to_append = Int32Chunked::new(PlSmallStr::from_static("a"), &[4, 5, 6]); ca.extend(&to_append)?; let location2 = ca.cont_slice().unwrap().as_ptr() as usize; @@ -218,8 +218,8 @@ mod test { #[test] fn test_extend_string() -> PolarsResult<()> { - let mut ca = StringChunked::new("a", &["a", "b", "c"]); - let to_append = StringChunked::new("a", &["a", "b", "e"]); + let mut ca = StringChunked::new(PlSmallStr::from_static("a"), &["a", "b", "c"]); + let to_append = StringChunked::new(PlSmallStr::from_static("a"), &["a", "b", "e"]); ca.extend(&to_append)?; assert_eq!(ca.len(), 6); @@ -231,8 +231,8 @@ mod test { #[test] fn test_extend_bool() -> PolarsResult<()> { - let mut ca = BooleanChunked::new("a", [true, false]); - let to_append = BooleanChunked::new("a", &[false, false]); + let mut ca = 
BooleanChunked::new(PlSmallStr::from_static("a"), [true, false]); + let to_append = BooleanChunked::new(PlSmallStr::from_static("a"), &[false, false]); ca.extend(&to_append)?; assert_eq!(ca.len(), 4); diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index 470215efe1bc..7aa348d5e440 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -30,7 +30,7 @@ impl Series { /// ```rust /// # use polars_core::prelude::*; /// fn example() -> PolarsResult<()> { - /// let s = Series::new("some_missing", &[Some(1), None, Some(2)]); + /// let s = Series::new("some_missing".into(), &[Some(1), None, Some(2)]); /// /// let filled = s.fill_null(FillNullStrategy::Forward(None))?; /// assert_eq!(Vec::from(filled.i32()?), &[Some(1), Some(1), Some(2)]); @@ -219,7 +219,7 @@ where FillNullStrategy::Forward(_) => unreachable!(), FillNullStrategy::Backward(_) => unreachable!(), }; - out.rename(ca.name()); + out.rename(ca.name().clone()); Ok(out) } diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index 2fb493f9e9c3..a927f3c6cd99 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -190,12 +190,12 @@ where if filter.len() == 1 { return match filter.get(0) { Some(true) => Ok(self.clone()), - _ => Ok(ObjectChunked::new_empty(self.name())), + _ => Ok(ObjectChunked::new_empty(self.name().clone())), }; } check_filter_len!(self, filter); let chunks = self.downcast_iter().collect::>(); - let mut builder = ObjectChunkedBuilder::::new(self.name(), self.len()); + let mut builder = ObjectChunkedBuilder::::new(self.name().clone(), self.len()); for (idx, mask) in filter.into_iter().enumerate() { if mask.unwrap_or(false) { let (chunk_idx, idx) = self.index_to_chunked_index(idx); diff --git 
a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs index 790e7a23e6ed..dbe8290956dd 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -8,7 +8,7 @@ impl ChunkFull for ChunkedArray where T: PolarsNumericType, { - fn full(name: &str, value: T::Native, length: usize) -> Self { + fn full(name: PlSmallStr, value: T::Native, length: usize) -> Self { let data = vec![value; length]; let mut out = ChunkedArray::from_vec(name, data); out.set_sorted_flag(IsSorted::Ascending); @@ -20,13 +20,13 @@ impl ChunkFullNull for ChunkedArray where T: PolarsNumericType, { - fn full_null(name: &str, length: usize) -> Self { + fn full_null(name: PlSmallStr, length: usize) -> Self { let arr = PrimitiveArray::new_null(T::get_dtype().to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) } } impl ChunkFull for BooleanChunked { - fn full(name: &str, value: bool, length: usize) -> Self { + fn full(name: PlSmallStr, value: bool, length: usize) -> Self { let mut bits = MutableBitmap::with_capacity(length); bits.extend_constant(length, value); let arr = BooleanArray::from_data_default(bits.into(), None); @@ -37,14 +37,14 @@ impl ChunkFull for BooleanChunked { } impl ChunkFullNull for BooleanChunked { - fn full_null(name: &str, length: usize) -> Self { + fn full_null(name: PlSmallStr, length: usize) -> Self { let arr = BooleanArray::new_null(ArrowDataType::Boolean, length); ChunkedArray::with_chunk(name, arr) } } impl<'a> ChunkFull<&'a str> for StringChunked { - fn full(name: &str, value: &'a str, length: usize) -> Self { + fn full(name: PlSmallStr, value: &'a str, length: usize) -> Self { let mut builder = StringChunkedBuilder::new(name, length); builder.chunk_builder.extend_constant(length, Some(value)); let mut out = builder.finish(); @@ -54,14 +54,14 @@ impl<'a> ChunkFull<&'a str> for StringChunked { } impl ChunkFullNull for StringChunked { - fn 
full_null(name: &str, length: usize) -> Self { + fn full_null(name: PlSmallStr, length: usize) -> Self { let arr = Utf8ViewArray::new_null(DataType::String.to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) } } impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { - fn full(name: &str, value: &'a [u8], length: usize) -> Self { + fn full(name: PlSmallStr, value: &'a [u8], length: usize) -> Self { let mut builder = BinaryChunkedBuilder::new(name, length); builder.chunk_builder.extend_constant(length, Some(value)); let mut out = builder.finish(); @@ -71,7 +71,7 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { } impl ChunkFullNull for BinaryChunked { - fn full_null(name: &str, length: usize) -> Self { + fn full_null(name: PlSmallStr, length: usize) -> Self { let arr = BinaryViewArray::new_null(DataType::Binary.to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) @@ -79,7 +79,7 @@ impl ChunkFullNull for BinaryChunked { } impl<'a> ChunkFull<&'a [u8]> for BinaryOffsetChunked { - fn full(name: &str, value: &'a [u8], length: usize) -> Self { + fn full(name: PlSmallStr, value: &'a [u8], length: usize) -> Self { let mut mutable = MutableBinaryArray::with_capacities(length, length * value.len()); mutable.extend_values(std::iter::repeat(value).take(length)); let arr: BinaryArray = mutable.into(); @@ -90,7 +90,7 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryOffsetChunked { } impl ChunkFullNull for BinaryOffsetChunked { - fn full_null(name: &str, length: usize) -> Self { + fn full_null(name: PlSmallStr, length: usize) -> Self { let arr = BinaryArray::::new_null( DataType::BinaryOffset.to_arrow(CompatLevel::newest()), length, @@ -100,7 +100,7 @@ impl ChunkFullNull for BinaryOffsetChunked { } impl ChunkFull<&Series> for ListChunked { - fn full(name: &str, value: &Series, length: usize) -> ListChunked { + fn full(name: PlSmallStr, value: &Series, length: usize) -> ListChunked { let mut builder = get_list_builder(value.dtype(), 
value.len() * length, length, name).unwrap(); for _ in 0..length { @@ -111,7 +111,7 @@ impl ChunkFull<&Series> for ListChunked { } impl ChunkFullNull for ListChunked { - fn full_null(name: &str, length: usize) -> ListChunked { + fn full_null(name: PlSmallStr, length: usize) -> ListChunked { ListChunked::full_null_with_dtype(name, length, &DataType::Null) } } @@ -119,7 +119,7 @@ impl ChunkFullNull for ListChunked { #[cfg(feature = "dtype-array")] impl ArrayChunked { pub fn full_null_with_dtype( - name: &str, + name: PlSmallStr, length: usize, inner_dtype: &DataType, width: usize, @@ -127,7 +127,7 @@ impl ArrayChunked { let arr = FixedSizeListArray::new_null( ArrowDataType::FixedSizeList( Box::new(ArrowField::new( - "item", + PlSmallStr::from_static("item"), inner_dtype.to_arrow(CompatLevel::newest()), true, )), @@ -141,12 +141,12 @@ impl ArrayChunked { #[cfg(feature = "dtype-array")] impl ChunkFull<&Series> for ArrayChunked { - fn full(name: &str, value: &Series, length: usize) -> ArrayChunked { + fn full(name: PlSmallStr, value: &Series, length: usize) -> ArrayChunked { let width = value.len(); let dtype = value.dtype(); let arrow_dtype = ArrowDataType::FixedSizeList( Box::new(ArrowField::new( - "item", + PlSmallStr::from_static("item"), dtype.to_arrow(CompatLevel::newest()), true, )), @@ -160,16 +160,20 @@ impl ChunkFull<&Series> for ArrayChunked { #[cfg(feature = "dtype-array")] impl ChunkFullNull for ArrayChunked { - fn full_null(name: &str, length: usize) -> ArrayChunked { + fn full_null(name: PlSmallStr, length: usize) -> ArrayChunked { ArrayChunked::full_null_with_dtype(name, length, &DataType::Null, 0) } } impl ListChunked { - pub fn full_null_with_dtype(name: &str, length: usize, inner_dtype: &DataType) -> ListChunked { + pub fn full_null_with_dtype( + name: PlSmallStr, + length: usize, + inner_dtype: &DataType, + ) -> ListChunked { let arr: ListArray = ListArray::new_null( ArrowDataType::LargeList(Box::new(ArrowField::new( - "item", + 
PlSmallStr::from_static("item"), inner_dtype.to_physical().to_arrow(CompatLevel::newest()), true, ))), @@ -187,8 +191,8 @@ impl ListChunked { } #[cfg(feature = "dtype-struct")] impl ChunkFullNull for StructChunked { - fn full_null(name: &str, length: usize) -> StructChunked { - let s = vec![Series::new_null("", length)]; + fn full_null(name: PlSmallStr, length: usize) -> StructChunked { + let s = vec![Series::new_null(PlSmallStr::const_default(), length)]; StructChunked::from_series(name, &s) .unwrap() .with_outer_validity(Some(Bitmap::new_zeroed(length))) @@ -197,7 +201,7 @@ impl ChunkFullNull for StructChunked { #[cfg(feature = "object")] impl ChunkFull for ObjectChunked { - fn full(name: &str, value: T, length: usize) -> Self + fn full(name: PlSmallStr, value: T, length: usize) -> Self where Self: Sized, { @@ -209,7 +213,7 @@ impl ChunkFull for ObjectChunked { #[cfg(feature = "object")] impl ChunkFullNull for ObjectChunked { - fn full_null(name: &str, length: usize) -> ObjectChunked { + fn full_null(name: PlSmallStr, length: usize) -> ObjectChunked { let mut ca: Self = (0..length).map(|_| None).collect(); ca.rename(name); ca diff --git a/crates/polars-core/src/chunked_array/ops/gather.rs b/crates/polars-core/src/chunked_array/ops/gather.rs index 00c93053cc1e..135621b7a245 100644 --- a/crates/polars-core/src/chunked_array/ops/gather.rs +++ b/crates/polars-core/src/chunked_array/ops/gather.rs @@ -244,7 +244,7 @@ impl ChunkTakeUnchecked for BinaryChunked { .map(|arr| take_unchecked(arr.as_ref(), indices_arr)) .collect::>(); - let mut out = ChunkedArray::from_chunks(self.name(), chunks); + let mut out = ChunkedArray::from_chunks(self.name().clone(), chunks); let sorted_flag = _update_gather_sorted_flag(self.is_sorted_flag(), indices.is_sorted_flag()); @@ -264,7 +264,7 @@ impl ChunkTakeUnchecked for StringChunked { impl + ?Sized> ChunkTakeUnchecked for BinaryChunked { /// Gather values from ChunkedArray by index. 
unsafe fn take_unchecked(&self, indices: &I) -> Self { - let indices = IdxCa::mmap_slice("", indices.as_ref()); + let indices = IdxCa::mmap_slice(PlSmallStr::const_default(), indices.as_ref()); self.take_unchecked(&indices) } } @@ -296,7 +296,7 @@ impl ChunkTakeUnchecked for StructChunked { #[cfg(feature = "dtype-struct")] impl + ?Sized> ChunkTakeUnchecked for StructChunked { unsafe fn take_unchecked(&self, indices: &I) -> Self { - let idx = IdxCa::mmap_slice("", indices.as_ref()); + let idx = IdxCa::mmap_slice(PlSmallStr::const_default(), indices.as_ref()); self.take_unchecked(&idx) } } @@ -307,7 +307,7 @@ impl IdxCa { let idx = bytemuck::cast_slice::<_, IdxSize>(idx); let arr = unsafe { arrow::ffi::mmap::slice(idx) }; let arr = arr.with_validity_typed(Some(validity)); - let ca = IdxCa::with_chunk("", arr); + let ca = IdxCa::with_chunk(PlSmallStr::const_default(), arr); f(&ca) } diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index f946fce715e6..547497f3f1f8 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -131,7 +131,7 @@ pub trait ChunkSet<'a, A, B> { /// /// ```rust /// # use polars_core::prelude::*; - /// let ca = UInt32Chunked::new("a", &[1, 2, 3]); + /// let ca = UInt32Chunked::new("a".into(), &[1, 2, 3]); /// let new = ca.scatter_single(vec![0, 1], Some(10)).unwrap(); /// /// assert_eq!(Vec::from(&new), &[Some(10), Some(10), Some(3)]); @@ -150,7 +150,7 @@ pub trait ChunkSet<'a, A, B> { /// /// ```rust /// # use polars_core::prelude::*; - /// let ca = Int32Chunked::new("a", &[1, 2, 3]); + /// let ca = Int32Chunked::new("a".into(), &[1, 2, 3]); /// let new = ca.scatter_with(vec![0, 1], |opt_v| opt_v.map(|v| v - 5)).unwrap(); /// /// assert_eq!(Vec::from(&new), &[Some(-4), Some(-3), Some(3)]); @@ -169,8 +169,8 @@ pub trait ChunkSet<'a, A, B> { /// /// ```rust /// # use polars_core::prelude::*; - /// let ca = Int32Chunked::new("a", 
&[1, 2, 3]); - /// let mask = BooleanChunked::new("mask", &[false, true, false]); + /// let ca = Int32Chunked::new("a".into(), &[1, 2, 3]); + /// let mask = BooleanChunked::new("mask".into(), &[false, true, false]); /// let new = ca.set(&mask, Some(5)).unwrap(); /// assert_eq!(Vec::from(&new), &[Some(1), Some(5), Some(3)]); /// ``` @@ -425,13 +425,13 @@ pub trait ChunkFillNullValue { /// Fill a ChunkedArray with one value. pub trait ChunkFull { /// Create a ChunkedArray with a single value. - fn full(name: &str, value: T, length: usize) -> Self + fn full(name: PlSmallStr, value: T, length: usize) -> Self where Self: Sized; } pub trait ChunkFullNull { - fn full_null(_name: &str, _length: usize) -> Self + fn full_null(_name: PlSmallStr, _length: usize) -> Self where Self: Sized; } @@ -448,8 +448,8 @@ pub trait ChunkFilter { /// /// ```rust /// # use polars_core::prelude::*; - /// let array = Int32Chunked::new("array", &[1, 2, 3]); - /// let mask = BooleanChunked::new("mask", &[true, false, true]); + /// let array = Int32Chunked::new("array".into(), &[1, 2, 3]); + /// let mask = BooleanChunked::new("mask".into(), &[true, false, true]); /// /// let filtered = array.filter(&mask).unwrap(); /// assert_eq!(Vec::from(&filtered), [Some(1), Some(3)]) @@ -472,8 +472,8 @@ macro_rules! 
impl_chunk_expand { } let opt_val = $self.get($index); match opt_val { - Some(val) => ChunkedArray::full($self.name(), val, $length), - None => ChunkedArray::full_null($self.name(), $length), + Some(val) => ChunkedArray::full($self.name().clone(), val, $length), + None => ChunkedArray::full_null($self.name().clone(), $length), } }}; } @@ -526,11 +526,13 @@ impl ChunkExpandAtIndex for ListChunked { let opt_val = self.get_as_series(index); match opt_val { Some(val) => { - let mut ca = ListChunked::full(self.name(), &val, length); + let mut ca = ListChunked::full(self.name().clone(), &val, length); unsafe { ca.to_logical(self.inner_dtype().clone()) }; ca }, - None => ListChunked::full_null_with_dtype(self.name(), length, self.inner_dtype()), + None => { + ListChunked::full_null_with_dtype(self.name().clone(), length, self.inner_dtype()) + }, } } } @@ -547,7 +549,7 @@ impl ChunkExpandAtIndex for StructChunked { .values() .iter() .map(|arr| { - let s = Series::try_from(("", arr.clone())).unwrap(); + let s = Series::try_from((PlSmallStr::const_default(), arr.clone())).unwrap(); let s = s.new_from_index(idx, length); s.chunks()[0].clone() }) @@ -567,12 +569,12 @@ impl ChunkExpandAtIndex for ArrayChunked { let opt_val = self.get_as_series(index); match opt_val { Some(val) => { - let mut ca = ArrayChunked::full(self.name(), &val, length); + let mut ca = ArrayChunked::full(self.name().clone(), &val, length); unsafe { ca.to_logical(self.inner_dtype().clone()) }; ca }, None => ArrayChunked::full_null_with_dtype( - self.name(), + self.name().clone(), length, self.inner_dtype(), self.width(), @@ -586,8 +588,8 @@ impl ChunkExpandAtIndex> for ObjectChunked { fn new_from_index(&self, index: usize, length: usize) -> ObjectChunked { let opt_val = self.get(index); match opt_val { - Some(val) => ObjectChunked::::full(self.name(), val.clone(), length), - None => ObjectChunked::::full_null(self.name(), length), + Some(val) => ObjectChunked::::full(self.name().clone(), val.clone(), 
length), + None => ObjectChunked::::full_null(self.name().clone(), length), } } } diff --git a/crates/polars-core/src/chunked_array/ops/nulls.rs b/crates/polars-core/src/chunked_array/ops/nulls.rs index c0ba435c4a51..1d1640055a72 100644 --- a/crates/polars-core/src/chunked_array/ops/nulls.rs +++ b/crates/polars-core/src/chunked_array/ops/nulls.rs @@ -7,19 +7,19 @@ impl ChunkedArray { /// Get a mask of the null values. pub fn is_null(&self) -> BooleanChunked { if !self.has_nulls() { - return BooleanChunked::full(self.name(), false, self.len()); + return BooleanChunked::full(self.name().clone(), false, self.len()); } // dispatch to non-generic function - is_null(self.name(), &self.chunks) + is_null(self.name().clone(), &self.chunks) } /// Get a mask of the valid values. pub fn is_not_null(&self) -> BooleanChunked { if self.null_count() == 0 { - return BooleanChunked::full(self.name(), true, self.len()); + return BooleanChunked::full(self.name().clone(), true, self.len()); } // dispatch to non-generic function - is_not_null(self.name(), &self.chunks) + is_not_null(self.name().clone(), &self.chunks) } pub(crate) fn coalesce_nulls(&self, other: &[ArrayRef]) -> Self { @@ -30,7 +30,7 @@ impl ChunkedArray { } } -pub fn is_not_null(name: &str, chunks: &[ArrayRef]) -> BooleanChunked { +pub fn is_not_null(name: PlSmallStr, chunks: &[ArrayRef]) -> BooleanChunked { let chunks = chunks.iter().map(|arr| { let bitmap = arr .validity() @@ -41,7 +41,7 @@ pub fn is_not_null(name: &str, chunks: &[ArrayRef]) -> BooleanChunked { BooleanChunked::from_chunk_iter(name, chunks) } -pub fn is_null(name: &str, chunks: &[ArrayRef]) -> BooleanChunked { +pub fn is_null(name: PlSmallStr, chunks: &[ArrayRef]) -> BooleanChunked { let chunks = chunks.iter().map(|arr| { let bitmap = arr .validity() @@ -52,7 +52,7 @@ pub fn is_null(name: &str, chunks: &[ArrayRef]) -> BooleanChunked { BooleanChunked::from_chunk_iter(name, chunks) } -pub fn replace_non_null(name: &str, chunks: &[ArrayRef], default: bool) 
-> BooleanChunked { +pub fn replace_non_null(name: PlSmallStr, chunks: &[ArrayRef], default: bool) -> BooleanChunked { BooleanChunked::from_chunk_iter( name, chunks.iter().map(|el| { diff --git a/crates/polars-core/src/chunked_array/ops/reverse.rs b/crates/polars-core/src/chunked_array/ops/reverse.rs index 9d3b0938f390..737867030a59 100644 --- a/crates/polars-core/src/chunked_array/ops/reverse.rs +++ b/crates/polars-core/src/chunked_array/ops/reverse.rs @@ -15,7 +15,7 @@ where } else { self.into_iter().rev().collect_trusted() }; - out.rename(self.name()); + out.rename(self.name().clone()); match self.is_sorted_flag() { IsSorted::Ascending => out.set_sorted_flag(IsSorted::Descending), @@ -32,7 +32,7 @@ macro_rules! impl_reverse { impl ChunkReverse for $ca_type { fn reverse(&self) -> Self { let mut ca: Self = self.into_iter().rev().collect_trusted(); - ca.rename(self.name()); + ca.rename(self.name().clone()); ca } } @@ -60,13 +60,16 @@ impl ChunkReverse for BinaryChunked { ) .boxed(); BinaryChunked::from_chunks_and_dtype_unchecked( - self.name(), + self.name().clone(), vec![arr], self.dtype().clone(), ) } } else { - let ca = IdxCa::from_vec("", (0..self.len() as IdxSize).rev().collect()); + let ca = IdxCa::from_vec( + PlSmallStr::const_default(), + (0..self.len() as IdxSize).rev().collect(), + ); unsafe { self.take_unchecked(&ca) } } } @@ -89,7 +92,7 @@ impl ChunkReverse for ArrayChunked { let values = arr.values().as_ref(); let mut builder = - get_fixed_size_list_builder(ca.inner_dtype(), ca.len(), ca.width(), ca.name()) + get_fixed_size_list_builder(ca.inner_dtype(), ca.len(), ca.width(), ca.name().clone()) .expect("not yet supported"); // SAFETY, we are within bounds @@ -117,6 +120,12 @@ impl ChunkReverse for ArrayChunked { impl ChunkReverse for ObjectChunked { fn reverse(&self) -> Self { // SAFETY: we know we don't go out of bounds. 
- unsafe { self.take_unchecked(&(0..self.len() as IdxSize).rev().collect_ca("")) } + unsafe { + self.take_unchecked( + &(0..self.len() as IdxSize) + .rev() + .collect_ca(PlSmallStr::const_default()), + ) + } } } diff --git a/crates/polars-core/src/chunked_array/ops/rolling_window.rs b/crates/polars-core/src/chunked_array/ops/rolling_window.rs index c5898edb4df1..fb44f1ab946a 100644 --- a/crates/polars-core/src/chunked_array/ops/rolling_window.rs +++ b/crates/polars-core/src/chunked_array/ops/rolling_window.rs @@ -106,14 +106,16 @@ mod inner_mod { let len = self.len(); let arr = ca.downcast_iter().next().unwrap(); - let mut ca = ChunkedArray::::from_slice("", &[T::Native::zero()]); + let mut ca = + ChunkedArray::::from_slice(PlSmallStr::const_default(), &[T::Native::zero()]); let ptr = ca.chunks[0].as_mut() as *mut dyn Array as *mut PrimitiveArray; let mut series_container = ca.into_series(); - let mut builder = PrimitiveChunkedBuilder::::new(self.name(), self.len()); + let mut builder = PrimitiveChunkedBuilder::::new(self.name().clone(), self.len()); if let Some(weights) = options.weights { - let weights_series = Float64Chunked::new("weights", &weights).into_series(); + let weights_series = + Float64Chunked::new(PlSmallStr::from_static("weights"), &weights).into_series(); let weights_series = weights_series.cast(self.dtype()).unwrap(); @@ -221,7 +223,7 @@ mod inner_mod { F: FnMut(&mut ChunkedArray) -> Option, { if window_size > self.len() { - return Ok(Self::full_null(self.name(), self.len())); + return Ok(Self::full_null(self.name().clone(), self.len())); } let ca = self.rechunk(); let arr = ca.downcast_iter().next().unwrap(); @@ -229,7 +231,8 @@ mod inner_mod { // We create a temporary dummy ChunkedArray. This will be a // container where we swap the window contents every iteration doing // so will save a lot of heap allocations. 
- let mut heap_container = ChunkedArray::::from_slice("", &[T::Native::zero()]); + let mut heap_container = + ChunkedArray::::from_slice(PlSmallStr::const_default(), &[T::Native::zero()]); let ptr = heap_container.chunks[0].as_mut() as *mut dyn Array as *mut PrimitiveArray; @@ -274,7 +277,7 @@ mod inner_mod { values.into(), Some(validity.into()), ); - Ok(Self::with_chunk(self.name(), arr)) + Ok(Self::with_chunk(self.name().clone(), arr)) } } } diff --git a/crates/polars-core/src/chunked_array/ops/set.rs b/crates/polars-core/src/chunked_array/ops/set.rs index 77aba5673ea2..5717cacae98e 100644 --- a/crates/polars-core/src/chunked_array/ops/set.rs +++ b/crates/polars-core/src/chunked_array/ops/set.rs @@ -57,7 +57,7 @@ where value, T::get_dtype().to_arrow(CompatLevel::newest()), )?; - return Ok(Self::with_chunk(self.name(), arr)); + return Ok(Self::with_chunk(self.name().clone(), arr)); } // Other fast path. Slightly slower as it does not do a memcpy. else { @@ -71,7 +71,7 @@ where *val = value; Ok(()) })?; - return Ok(Self::from_vec(self.name(), av)); + return Ok(Self::from_vec(self.name().clone(), av)); } } } @@ -86,7 +86,7 @@ where where F: Fn(Option) -> Option, { - let mut builder = PrimitiveChunkedBuilder::::new(self.name(), self.len()); + let mut builder = PrimitiveChunkedBuilder::::new(self.name().clone(), self.len()); impl_scatter_with!(self, builder, idx, f) } @@ -109,7 +109,7 @@ where T::get_dtype().to_arrow(CompatLevel::newest()), ) }); - Ok(ChunkedArray::from_chunk_iter(self.name(), chunks)) + Ok(ChunkedArray::from_chunk_iter(self.name().clone(), chunks)) } else { // slow path, could be optimized. 
let ca = mask @@ -120,7 +120,7 @@ where _ => opt_val, }) .collect_trusted::() - .with_name(self.name()); + .with_name(self.name().clone()); Ok(ca) } } @@ -160,7 +160,7 @@ impl<'a> ChunkSet<'a, bool, bool> for BooleanChunked { validity.set(i, f(input).unwrap_or(false)); } let arr = BooleanArray::from_data_default(values.into(), Some(validity.into())); - Ok(BooleanChunked::with_chunk(self.name(), arr)) + Ok(BooleanChunked::with_chunk(self.name().clone(), arr)) } fn set(&'a self, mask: &BooleanChunked, value: Option) -> PolarsResult { @@ -173,7 +173,7 @@ impl<'a> ChunkSet<'a, bool, bool> for BooleanChunked { _ => opt_val, }) .collect_trusted::() - .with_name(self.name()); + .with_name(self.name().clone()); Ok(ca) } } @@ -189,7 +189,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for StringChunked { { let idx_iter = idx.into_iter(); let mut ca_iter = self.into_iter().enumerate(); - let mut builder = StringChunkedBuilder::new(self.name(), self.len()); + let mut builder = StringChunkedBuilder::new(self.name().clone(), self.len()); for current_idx in idx_iter.into_iter().map(|i| i as usize) { polars_ensure!(current_idx < self.len(), oob = current_idx, self.len()); @@ -220,7 +220,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for StringChunked { Self: Sized, F: Fn(Option<&'a str>) -> Option, { - let mut builder = StringChunkedBuilder::new(self.name(), self.len()); + let mut builder = StringChunkedBuilder::new(self.name().clone(), self.len()); impl_scatter_with!(self, builder, idx, f) } @@ -237,7 +237,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for StringChunked { _ => opt_val, }) .collect_trusted::() - .with_name(self.name()); + .with_name(self.name().clone()); Ok(ca) } } @@ -252,7 +252,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { Self: Sized, { let mut ca_iter = self.into_iter().enumerate(); - let mut builder = BinaryChunkedBuilder::new(self.name(), self.len()); + let mut builder = BinaryChunkedBuilder::new(self.name().clone(), self.len()); for current_idx in 
idx.into_iter().map(|i| i as usize) { polars_ensure!(current_idx < self.len(), oob = current_idx, self.len()); @@ -283,7 +283,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { Self: Sized, F: Fn(Option<&'a [u8]>) -> Option>, { - let mut builder = BinaryChunkedBuilder::new(self.name(), self.len()); + let mut builder = BinaryChunkedBuilder::new(self.name().clone(), self.len()); impl_scatter_with!(self, builder, idx, f) } @@ -300,7 +300,7 @@ impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { _ => opt_val, }) .collect_trusted::() - .with_name(self.name()); + .with_name(self.name().clone()); Ok(ca) } } @@ -311,23 +311,26 @@ mod test { #[test] fn test_set() { - let ca = Int32Chunked::new("a", &[1, 2, 3]); - let mask = BooleanChunked::new("mask", &[false, true, false]); + let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let mask = BooleanChunked::new(PlSmallStr::from_static("mask"), &[false, true, false]); let ca = ca.set(&mask, Some(5)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(1), Some(5), Some(3)]); - let ca = Int32Chunked::new("a", &[1, 2, 3]); - let mask = BooleanChunked::new("mask", &[None, Some(true), None]); + let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let mask = BooleanChunked::new(PlSmallStr::from_static("mask"), &[None, Some(true), None]); let ca = ca.set(&mask, Some(5)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(1), Some(5), Some(3)]); - let ca = Int32Chunked::new("a", &[1, 2, 3]); - let mask = BooleanChunked::new("mask", &[None, None, None]); + let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let mask = BooleanChunked::new(PlSmallStr::from_static("mask"), &[None, None, None]); let ca = ca.set(&mask, Some(5)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(1), Some(2), Some(3)]); - let ca = Int32Chunked::new("a", &[1, 2, 3]); - let mask = BooleanChunked::new("mask", &[Some(true), Some(false), None]); + let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 
3]); + let mask = BooleanChunked::new( + PlSmallStr::from_static("mask"), + &[Some(true), Some(false), None], + ); let ca = ca.set(&mask, Some(5)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(5), Some(2), Some(3)]); @@ -337,30 +340,39 @@ mod test { assert!(ca.scatter_single(vec![0, 10], Some(0)).is_err()); // test booleans - let ca = BooleanChunked::new("a", &[true, true, true]); - let mask = BooleanChunked::new("mask", &[false, true, false]); + let ca = BooleanChunked::new(PlSmallStr::from_static("a"), &[true, true, true]); + let mask = BooleanChunked::new(PlSmallStr::from_static("mask"), &[false, true, false]); let ca = ca.set(&mask, None).unwrap(); assert_eq!(Vec::from(&ca), &[Some(true), None, Some(true)]); // test string - let ca = StringChunked::new("a", &["foo", "foo", "foo"]); - let mask = BooleanChunked::new("mask", &[false, true, false]); + let ca = StringChunked::new(PlSmallStr::from_static("a"), &["foo", "foo", "foo"]); + let mask = BooleanChunked::new(PlSmallStr::from_static("mask"), &[false, true, false]); let ca = ca.set(&mask, Some("bar")).unwrap(); assert_eq!(Vec::from(&ca), &[Some("foo"), Some("bar"), Some("foo")]); } #[test] fn test_set_null_values() { - let ca = Int32Chunked::new("a", &[Some(1), None, Some(3)]); - let mask = BooleanChunked::new("mask", &[Some(false), Some(true), None]); + let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[Some(1), None, Some(3)]); + let mask = BooleanChunked::new( + PlSmallStr::from_static("mask"), + &[Some(false), Some(true), None], + ); let ca = ca.set(&mask, Some(2)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(1), Some(2), Some(3)]); - let ca = StringChunked::new("a", &[Some("foo"), None, Some("bar")]); + let ca = StringChunked::new( + PlSmallStr::from_static("a"), + &[Some("foo"), None, Some("bar")], + ); let ca = ca.set(&mask, Some("foo")).unwrap(); assert_eq!(Vec::from(&ca), &[Some("foo"), Some("foo"), Some("bar")]); - let ca = BooleanChunked::new("a", &[Some(false), None, Some(true)]); + let ca = 
BooleanChunked::new( + PlSmallStr::from_static("a"), + &[Some(false), None, Some(true)], + ); let ca = ca.set(&mask, Some(true)).unwrap(); assert_eq!(Vec::from(&ca), &[Some(false), Some(true), Some(true)]); } diff --git a/crates/polars-core/src/chunked_array/ops/shift.rs b/crates/polars-core/src/chunked_array/ops/shift.rs index b200635dfe71..54f1f2ff0deb 100644 --- a/crates/polars-core/src/chunked_array/ops/shift.rs +++ b/crates/polars-core/src/chunked_array/ops/shift.rs @@ -9,8 +9,8 @@ macro_rules! impl_shift_fill { if fill_length >= $self.len() { return match $fill_value { - Some(fill) => Self::full($self.name(), fill, $self.len()), - None => Self::full_null($self.name(), $self.len()), + Some(fill) => Self::full($self.name().clone(), fill, $self.len()), + None => Self::full_null($self.name().clone(), $self.len()), }; } let slice_offset = (-$periods).max(0) as i64; @@ -18,8 +18,8 @@ macro_rules! impl_shift_fill { let mut slice = $self.slice(slice_offset, length); let mut fill = match $fill_value { - Some(val) => Self::full($self.name(), val, fill_length), - None => Self::full_null($self.name(), fill_length), + Some(val) => Self::full($self.name().clone(), val, fill_length), + None => Self::full_null($self.name().clone(), fill_length), }; if $periods < 0 { @@ -112,8 +112,12 @@ impl ChunkShiftFill> for ListChunked { let fill_length = abs(periods) as usize; let mut fill = match fill_value { - Some(val) => Self::full(self.name(), val, fill_length), - None => ListChunked::full_null_with_dtype(self.name(), fill_length, self.inner_dtype()), + Some(val) => Self::full(self.name().clone(), val, fill_length), + None => ListChunked::full_null_with_dtype( + self.name().clone(), + fill_length, + self.inner_dtype(), + ), }; if periods < 0 { @@ -144,10 +148,13 @@ impl ChunkShiftFill> for ArrayChunked { let fill_length = abs(periods) as usize; let mut fill = match fill_value { - Some(val) => Self::full(self.name(), val, fill_length), - None => { - 
ArrayChunked::full_null_with_dtype(self.name(), fill_length, self.inner_dtype(), 0) - }, + Some(val) => Self::full(self.name().clone(), val, fill_length), + None => ArrayChunked::full_null_with_dtype( + self.name().clone(), + fill_length, + self.inner_dtype(), + 0, + ), }; if periods < 0 { @@ -197,7 +204,7 @@ impl ChunkShift for StructChunked { let fill_length = abs(periods) as usize; // Go via null, so the cast creates the proper struct type. - let fill = NullChunked::new(self.name().into(), fill_length) + let fill = NullChunked::new(self.name().clone(), fill_length) .cast(self.dtype(), Default::default()) .unwrap(); let mut fill = fill.struct_().unwrap().clone(); @@ -218,7 +225,7 @@ mod test { #[test] fn test_shift() { - let ca = Int32Chunked::new("", &[1, 2, 3]); + let ca = Int32Chunked::new(PlSmallStr::const_default(), &[1, 2, 3]); // shift by 0, 1, 2, 3, 4 let shifted = ca.shift_and_fill(0, Some(5)); @@ -251,7 +258,7 @@ mod test { assert_eq!(Vec::from(&shifted), &[Some(3), None, None]); // string - let s = Series::new("a", ["a", "b", "c"]); + let s = Series::new(PlSmallStr::from_static("a"), ["a", "b", "c"]); let shifted = s.shift(-1); assert_eq!( Vec::from(shifted.str().unwrap()), diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs index 724adbebe818..ca34d37318a7 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs @@ -14,7 +14,7 @@ where } pub(super) fn arg_sort( - name: &str, + name: PlSmallStr, iters: I, options: SortOptions, null_count: usize, @@ -69,7 +69,7 @@ where } pub(super) fn arg_sort_no_nulls( - name: &str, + name: PlSmallStr, iters: I, options: SortOptions, len: usize, diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs index 734d6cab79a2..f4ec3520dddc 100644 --- 
a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs @@ -121,7 +121,10 @@ pub fn encode_rows_vertical_par_unordered(by: &[Series]) -> PolarsResult>>()); - Ok(BinaryOffsetChunked::from_chunk_iter("", chunks?)) + Ok(BinaryOffsetChunked::from_chunk_iter( + PlSmallStr::const_default(), + chunks?, + )) } // Almost the same but broadcast nulls to the row-encoded array. @@ -156,12 +159,18 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( }); let chunks = POOL.install(|| chunks.collect::>>()); - Ok(BinaryOffsetChunked::from_chunk_iter("", chunks?)) + Ok(BinaryOffsetChunked::from_chunk_iter( + PlSmallStr::const_default(), + chunks?, + )) } pub(crate) fn encode_rows_unordered(by: &[Series]) -> PolarsResult { let rows = _get_rows_encoded_unordered(by)?; - Ok(BinaryOffsetChunked::with_chunk("", rows.into_array())) + Ok(BinaryOffsetChunked::with_chunk( + PlSmallStr::const_default(), + rows.into_array(), + )) } pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult { @@ -226,7 +235,7 @@ pub fn _get_rows_encoded( } pub fn _get_rows_encoded_ca( - name: &str, + name: PlSmallStr, by: &[Series], descending: &[bool], nulls_last: &[bool], @@ -244,7 +253,7 @@ pub fn _get_rows_encoded_arr( } pub fn _get_rows_encoded_ca_unordered( - name: &str, + name: PlSmallStr, by: &[Series], ) -> PolarsResult { _get_rows_encoded_unordered(by) diff --git a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs index afc80026313b..3c38c7ffe608 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs @@ -19,7 +19,7 @@ impl CategoricalChunked { let cats: UInt32Chunked = vals .into_iter() .map(|(idx, _v)| idx) - .collect_ca_trusted(self.name()); + .collect_ca_trusted(self.name().clone()); // SAFETY: // we only reordered the indexes so we are 
still in bounds @@ -61,7 +61,7 @@ impl CategoricalChunked { if self.uses_lexical_ordering() { let iters = [self.iter_str()]; arg_sort::arg_sort( - self.name(), + self.name().clone(), iters, options, self.physical().null_count(), @@ -124,7 +124,7 @@ mod test { enable_string_cache(); } - let s = Series::new("", init) + let s = Series::new(PlSmallStr::const_default(), init) .cast(&DataType::Categorical(None, CategoricalOrdering::Lexical))?; let ca = s.categorical()?; let ca_lexical = ca.clone(); @@ -132,7 +132,8 @@ mod test { let out = ca_lexical.sort(false); assert_order(&out, &["a", "b", "c", "d"]); - let s = Series::new("", init).cast(&DataType::Categorical(None, Default::default()))?; + let s = Series::new(PlSmallStr::const_default(), init) + .cast(&DataType::Categorical(None, Default::default()))?; let ca = s.categorical()?; let out = ca.sort(false); @@ -159,7 +160,7 @@ mod test { enable_string_cache(); } - let s = Series::new("", init) + let s = Series::new(PlSmallStr::const_default(), init) .cast(&DataType::Categorical(None, CategoricalOrdering::Lexical))?; let ca = s.categorical()?; let ca_lexical: CategoricalChunked = ca.clone(); diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index eb16506d5127..1c1940b6f10d 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -165,7 +165,7 @@ where sort_impl_unstable(vals.as_mut_slice(), options); - let mut ca = ChunkedArray::from_vec(ca.name(), vals); + let mut ca = ChunkedArray::from_vec(ca.name().clone(), vals); let s = if options.descending { IsSorted::Descending } else { @@ -205,7 +205,7 @@ where vals.into(), Some(create_validity(len, null_count, options.nulls_last)), ); - let mut new_ca = ChunkedArray::with_chunk(ca.name(), arr); + let mut new_ca = ChunkedArray::with_chunk(ca.name().clone(), arr); let s = if options.descending { IsSorted::Descending } else { @@ -225,12 
+225,12 @@ where let iter = ca .downcast_iter() .map(|arr| arr.values().as_slice().iter().copied()); - arg_sort::arg_sort_no_nulls(ca.name(), iter, options, ca.len()) + arg_sort::arg_sort_no_nulls(ca.name().clone(), iter, options, ca.len()) } else { let iter = ca .downcast_iter() .map(|arr| arr.iter().map(|opt| opt.copied())); - arg_sort::arg_sort(ca.name(), iter, options, ca.null_count(), ca.len()) + arg_sort::arg_sort(ca.name().clone(), iter, options, ca.null_count(), ca.len()) } } @@ -409,14 +409,14 @@ impl ChunkSort for BinaryChunked { fn arg_sort(&self, options: SortOptions) -> IdxCa { if self.null_count() == 0 { arg_sort::arg_sort_no_nulls( - self.name(), + self.name().clone(), self.downcast_iter().map(|arr| arr.values_iter()), options, self.len(), ) } else { arg_sort::arg_sort( - self.name(), + self.name().clone(), self.downcast_iter().map(|arr| arr.iter()), options, self.null_count(), @@ -477,7 +477,7 @@ impl ChunkSort for BinaryOffsetChunked { let arr = unsafe { BinaryArray::from_data_unchecked_default(offsets.into(), values.into(), None) }; - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) }, (_, true) => { for val in v { @@ -495,7 +495,7 @@ impl ChunkSort for BinaryOffsetChunked { Some(create_validity(len, null_count, true)), ) }; - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) }, (_, false) => { offsets.extend(std::iter::repeat(length_so_far).take(null_count)); @@ -514,7 +514,7 @@ impl ChunkSort for BinaryOffsetChunked { Some(create_validity(len, null_count, false)), ) }; - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) }, }; @@ -552,13 +552,16 @@ impl ChunkSort for BinaryOffsetChunked { if self.null_count() == 0 { argsort(&mut idx); - IdxCa::from_vec(self.name(), idx) + IdxCa::from_vec(self.name().clone(), idx) } else { // This branch (almost?) never gets called as the row-encoding also encodes nulls. 
let (partitioned_part, validity) = partition_nulls(&mut idx, arr.validity().cloned(), options); argsort(partitioned_part); - IdxCa::with_chunk(self.name(), IdxArr::from_data_default(idx.into(), validity)) + IdxCa::with_chunk( + self.name().clone(), + IdxArr::from_data_default(idx.into(), validity), + ) } } @@ -595,7 +598,7 @@ impl ChunkSort for BinaryOffsetChunked { impl StructChunked { pub(crate) fn arg_sort(&self, options: SortOptions) -> IdxCa { let bin = _get_rows_encoded_ca( - self.name(), + self.name().clone(), &[self.clone().into_series()], &[options.descending], &[options.nulls_last], @@ -656,7 +659,7 @@ impl ChunkSort for BooleanChunked { } let mut ca: BooleanChunked = vals.into_iter().collect_trusted(); - ca.rename(self.name()); + ca.rename(self.name().clone()); ca } @@ -672,14 +675,14 @@ impl ChunkSort for BooleanChunked { fn arg_sort(&self, options: SortOptions) -> IdxCa { if self.null_count() == 0 { arg_sort::arg_sort_no_nulls( - self.name(), + self.name().clone(), self.downcast_iter().map(|arr| arr.values_iter()), options, self.len(), ) } else { arg_sort::arg_sort( - self.name(), + self.name().clone(), self.downcast_iter().map(|arr| arr.iter()), options, self.null_count(), @@ -721,7 +724,7 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult .iter() .map(convert_sort_column_multi_sort) .collect::>>()?; - let mut out = StructChunked::from_series(ca.name(), &new_fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?; out.zip_outer_validity(ca); out.into_series() }, @@ -775,7 +778,7 @@ mod test { #[test] fn test_arg_sort() { let a = Int32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[ Some(1), // 0 Some(5), // 1 @@ -809,7 +812,7 @@ mod test { #[test] fn test_sort() { let a = Int32Chunked::new( - "a", + PlSmallStr::from_static("a"), &[ Some(1), Some(5), @@ -859,7 +862,10 @@ mod test { None ] ); - let b = BooleanChunked::new("b", &[Some(false), Some(true), Some(false)]); + let b = 
BooleanChunked::new( + PlSmallStr::from_static("b"), + &[Some(false), Some(true), Some(false)], + ); let out = b.sort_with(SortOptions::default().with_order_descending(true)); assert_eq!(Vec::from(&out), &[Some(true), Some(false), Some(false)]); let out = b.sort_with(SortOptions::default().with_order_descending(false)); @@ -869,9 +875,12 @@ mod test { #[test] #[cfg_attr(miri, ignore)] fn test_arg_sort_multiple() -> PolarsResult<()> { - let a = Int32Chunked::new("a", &[1, 2, 1, 1, 3, 4, 3, 3]); - let b = Int64Chunked::new("b", &[0, 1, 2, 3, 4, 5, 6, 1]); - let c = StringChunked::new("c", &["a", "b", "c", "d", "e", "f", "g", "h"]); + let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3, 4, 3, 3]); + let b = Int64Chunked::new(PlSmallStr::from_static("b"), &[0, 1, 2, 3, 4, 5, 6, 1]); + let c = StringChunked::new( + PlSmallStr::from_static("c"), + &["a", "b", "c", "d", "e", "f", "g", "h"], + ); let df = DataFrame::new(vec![a.into_series(), b.into_series(), c.into_series()])?; let out = df.sort(["a", "b", "c"], SortMultipleOptions::default())?; @@ -890,8 +899,12 @@ mod test { ); // now let the first sort be a string - let a = StringChunked::new("a", &["a", "b", "c", "a", "b", "c"]).into_series(); - let b = Int32Chunked::new("b", &[5, 4, 2, 3, 4, 5]).into_series(); + let a = StringChunked::new( + PlSmallStr::from_static("a"), + &["a", "b", "c", "a", "b", "c"], + ) + .into_series(); + let b = Int32Chunked::new(PlSmallStr::from_static("b"), &[5, 4, 2, 3, 4, 5]).into_series(); let df = DataFrame::new(vec![a, b])?; let out = df.sort(["a", "b"], SortMultipleOptions::default())?; @@ -931,7 +944,10 @@ mod test { #[test] fn test_sort_string() { - let ca = StringChunked::new("a", &[Some("a"), None, Some("c"), None, Some("b")]); + let ca = StringChunked::new( + PlSmallStr::from_static("a"), + &[Some("a"), None, Some("c"), None, Some("b")], + ); let out = ca.sort_with(SortOptions { descending: false, nulls_last: false, @@ -970,7 +986,10 @@ mod test { 
assert_eq!(Vec::from(&out), expected); // no nulls - let ca = StringChunked::new("a", &[Some("a"), Some("c"), Some("b")]); + let ca = StringChunked::new( + PlSmallStr::from_static("a"), + &[Some("a"), Some("c"), Some("b")], + ); let out = ca.sort(false); let expected = &[Some("a"), Some("b"), Some("c")]; assert_eq!(Vec::from(&out), expected); diff --git a/crates/polars-core/src/chunked_array/ops/sort/options.rs b/crates/polars-core/src/chunked_array/ops/sort/options.rs index 8726da26774a..046d0b251b04 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/options.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/options.rs @@ -12,7 +12,7 @@ use crate::prelude::*; /// /// ``` /// # use polars_core::prelude::*; -/// let s = Series::new("a", [Some(5), Some(2), Some(3), Some(4), None].as_ref()); +/// let s = Series::new("a".into(), [Some(5), Some(2), Some(3), Some(4), None].as_ref()); /// let sorted = s /// .sort( /// SortOptions::default() @@ -23,7 +23,7 @@ use crate::prelude::*; /// .unwrap(); /// assert_eq!( /// sorted, -/// Series::new("a", [Some(5), Some(4), Some(3), Some(2), None].as_ref()) +/// Series::new("a".into(), [Some(5), Some(4), Some(3), Some(2), None].as_ref()) /// ); /// ``` #[derive(Copy, Clone, Eq, PartialEq, Debug, Hash)] diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index 40e16f08ed95..6713edc7fc71 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -116,7 +116,7 @@ where } let arr: PrimitiveArray = arr.into(); - Ok(ChunkedArray::with_chunk(self.name(), arr)) + Ok(ChunkedArray::with_chunk(self.name().clone(), arr)) } else { let mask = self.not_equal_missing(&self.shift(1)); self.filter(&mask) @@ -149,7 +149,7 @@ where let unique = state.finalize_unique(); - return Ok(Self::with_chunk(self.name(), unique)); + return Ok(Self::with_chunk(self.name().clone(), unique)); } } } @@ -161,7 
+161,7 @@ where } fn arg_unique(&self) -> PolarsResult { - Ok(IdxCa::from_vec(self.name(), arg_unique_ca!(self))) + Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self))) } fn n_unique(&self) -> PolarsResult { @@ -230,7 +230,7 @@ impl ChunkUnique for BinaryChunked { set.extend(arr.values_iter()) } Ok(BinaryChunked::from_iter_values( - self.name(), + self.name().clone(), set.iter().copied(), )) }, @@ -241,7 +241,7 @@ impl ChunkUnique for BinaryChunked { set.extend(arr.iter()) } Ok(BinaryChunked::from_iter_options( - self.name(), + self.name().clone(), set.iter().copied(), )) }, @@ -249,7 +249,7 @@ impl ChunkUnique for BinaryChunked { } fn arg_unique(&self) -> PolarsResult { - Ok(IdxCa::from_vec(self.name(), arg_unique_ca!(self))) + Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self))) } fn n_unique(&self) -> PolarsResult { @@ -290,11 +290,11 @@ impl ChunkUnique for BooleanChunked { let unique = state.finalize_unique(); - Ok(Self::with_chunk(self.name(), unique)) + Ok(Self::with_chunk(self.name().clone(), unique)) } fn arg_unique(&self) -> PolarsResult { - Ok(IdxCa::from_vec(self.name(), arg_unique_ca!(self))) + Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self))) } } @@ -304,7 +304,8 @@ mod test { #[test] fn unique() { - let ca = ChunkedArray::::from_slice("a", &[1, 2, 3, 2, 1]); + let ca = + ChunkedArray::::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]); assert_eq!( ca.unique() .unwrap() @@ -313,13 +314,16 @@ mod test { .collect::>(), vec![Some(1), Some(2), Some(3)] ); - let ca = BooleanChunked::from_slice("a", &[true, false, true]); + let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]); assert_eq!( ca.unique().unwrap().into_iter().collect::>(), vec![Some(false), Some(true)] ); - let ca = StringChunked::new("", &[Some("a"), None, Some("a"), Some("b"), None]); + let ca = StringChunked::new( + PlSmallStr::const_default(), + &[Some("a"), None, Some("a"), Some("b"), None], + ); assert_eq!( 
Vec::from(&ca.unique().unwrap().sort(false)), &[None, Some("a"), Some("b")] @@ -328,7 +332,8 @@ mod test { #[test] fn arg_unique() { - let ca = ChunkedArray::::from_slice("a", &[1, 2, 1, 1, 3]); + let ca = + ChunkedArray::::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]); assert_eq!( ca.arg_unique().unwrap().into_iter().collect::>(), vec![Some(0), Some(1), Some(4)] diff --git a/crates/polars-core/src/chunked_array/ops/zip.rs b/crates/polars-core/src/chunked_array/ops/zip.rs index cf85266581e7..c518954c91e7 100644 --- a/crates/polars-core/src/chunked_array/ops/zip.rs +++ b/crates/polars-core/src/chunked_array/ops/zip.rs @@ -26,7 +26,7 @@ where (1, other_len) => src.new_from_index(0, other_len), _ => polars_bail!(ShapeMismatch: SHAPE_MISMATCH_STR), }; - Ok(ret.with_name(if_true.name())) + Ok(ret.with_name(if_true.name().clone())) } fn bool_null_to_false(mask: &BooleanArray) -> Bitmap { @@ -156,7 +156,7 @@ where polars_bail!(ShapeMismatch: SHAPE_MISMATCH_STR) }; - Ok(ret.with_name(if_true.name())) + Ok(ret.with_name(if_true.name().clone())) } } @@ -237,7 +237,7 @@ impl ChunkZip for StructChunked { .map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs)) .collect::>>()?; - let mut out = StructChunked::from_series(self.name(), &fields)?; + let mut out = StructChunked::from_series(self.name().clone(), &fields)?; // Zip the validities. 
if (l.null_count + r.null_count) > 0 { diff --git a/crates/polars-core/src/chunked_array/random.rs b/crates/polars-core/src/chunked_array/random.rs index 18b1117669fc..3f65e2c761cc 100644 --- a/crates/polars-core/src/chunked_array/random.rs +++ b/crates/polars-core/src/chunked_array/random.rs @@ -12,7 +12,7 @@ use crate::utils::NoNull; fn create_rand_index_with_replacement(n: usize, len: usize, seed: Option) -> IdxCa { if len == 0 { - return IdxCa::new_vec("", vec![]); + return IdxCa::new_vec(PlSmallStr::const_default(), vec![]); } let mut rng = SmallRng::seed_from_u64(seed.unwrap_or_else(get_global_random_u64)); let dist = Uniform::new(0, len as IdxSize); @@ -45,7 +45,7 @@ fn create_rand_index_no_replacement( IndexVec::USize(v) => v.into_iter().map(|x| x as IdxSize).collect(), }; } - IdxCa::new_vec("", buf) + IdxCa::new_vec(PlSmallStr::const_default(), buf) } impl ChunkedArray @@ -251,7 +251,12 @@ where T::Native: Float, { /// Create [`ChunkedArray`] with samples from a Normal distribution. - pub fn rand_normal(name: &str, length: usize, mean: f64, std_dev: f64) -> PolarsResult { + pub fn rand_normal( + name: PlSmallStr, + length: usize, + mean: f64, + std_dev: f64, + ) -> PolarsResult { let normal = Normal::new(mean, std_dev).map_err(to_compute_err)?; let mut builder = PrimitiveChunkedBuilder::::new(name, length); let mut rng = rand::thread_rng(); @@ -264,7 +269,7 @@ where } /// Create [`ChunkedArray`] with samples from a Standard Normal distribution. - pub fn rand_standard_normal(name: &str, length: usize) -> Self { + pub fn rand_standard_normal(name: PlSmallStr, length: usize) -> Self { let mut builder = PrimitiveChunkedBuilder::::new(name, length); let mut rng = rand::thread_rng(); for _ in 0..length { @@ -276,7 +281,7 @@ where } /// Create [`ChunkedArray`] with samples from a Uniform distribution. 
- pub fn rand_uniform(name: &str, length: usize, low: f64, high: f64) -> Self { + pub fn rand_uniform(name: PlSmallStr, length: usize, low: f64, high: f64) -> Self { let uniform = Uniform::new(low, high); let mut builder = PrimitiveChunkedBuilder::::new(name, length); let mut rng = rand::thread_rng(); @@ -291,7 +296,7 @@ where impl BooleanChunked { /// Create [`ChunkedArray`] with samples from a Bernoulli distribution. - pub fn rand_bernoulli(name: &str, length: usize, p: f64) -> PolarsResult { + pub fn rand_bernoulli(name: PlSmallStr, length: usize, p: f64) -> PolarsResult { let dist = Bernoulli::new(p).map_err(to_compute_err)?; let mut rng = rand::thread_rng(); let mut builder = BooleanChunkedBuilder::new(name, length); @@ -316,31 +321,71 @@ mod test { // Default samples are random and don't require seeds. assert!(df - .sample_n(&Series::new("s", &[3]), false, false, None) + .sample_n( + &Series::new(PlSmallStr::from_static("s"), &[3]), + false, + false, + None + ) .is_ok()); assert!(df - .sample_frac(&Series::new("frac", &[0.4]), false, false, None) + .sample_frac( + &Series::new(PlSmallStr::from_static("frac"), &[0.4]), + false, + false, + None + ) .is_ok()); // With seeding. assert!(df - .sample_n(&Series::new("s", &[3]), false, false, Some(0)) + .sample_n( + &Series::new(PlSmallStr::from_static("s"), &[3]), + false, + false, + Some(0) + ) .is_ok()); assert!(df - .sample_frac(&Series::new("frac", &[0.4]), false, false, Some(0)) + .sample_frac( + &Series::new(PlSmallStr::from_static("frac"), &[0.4]), + false, + false, + Some(0) + ) .is_ok()); // Without replacement can not sample more than 100%. 
assert!(df - .sample_frac(&Series::new("frac", &[2.0]), false, false, Some(0)) + .sample_frac( + &Series::new(PlSmallStr::from_static("frac"), &[2.0]), + false, + false, + Some(0) + ) .is_err()); assert!(df - .sample_n(&Series::new("s", &[3]), true, false, Some(0)) + .sample_n( + &Series::new(PlSmallStr::from_static("s"), &[3]), + true, + false, + Some(0) + ) .is_ok()); assert!(df - .sample_frac(&Series::new("frac", &[0.4]), true, false, Some(0)) + .sample_frac( + &Series::new(PlSmallStr::from_static("frac"), &[0.4]), + true, + false, + Some(0) + ) .is_ok()); // With replacement can sample more than 100%. assert!(df - .sample_frac(&Series::new("frac", &[2.0]), true, false, Some(0)) + .sample_frac( + &Series::new(PlSmallStr::from_static("frac"), &[2.0]), + true, + false, + Some(0) + ) .is_ok()); } } diff --git a/crates/polars-core/src/chunked_array/struct_/frame.rs b/crates/polars-core/src/chunked_array/struct_/frame.rs index 28149aba4ca8..280a9df6da56 100644 --- a/crates/polars-core/src/chunked_array/struct_/frame.rs +++ b/crates/polars-core/src/chunked_array/struct_/frame.rs @@ -1,8 +1,10 @@ +use polars_utils::pl_str::PlSmallStr; + use crate::frame::DataFrame; use crate::prelude::StructChunked; impl DataFrame { - pub fn into_struct(self, name: &str) -> StructChunked { + pub fn into_struct(self, name: PlSmallStr) -> StructChunked { StructChunked::from_series(name, &self.columns).expect("same invariants") } } diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index faac8f5f80f1..b5110108a0ce 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -18,7 +18,7 @@ use crate::utils::Container; pub type StructChunked = ChunkedArray; -fn constructor(name: &str, fields: &[Series]) -> PolarsResult { +fn constructor(name: PlSmallStr, fields: &[Series]) -> PolarsResult { // Different chunk lengths: rechunk and recurse. 
if !fields.iter().map(|s| s.n_chunks()).all_equal() { let fields = fields.iter().map(|s| s.rechunk()).collect::>(); @@ -62,7 +62,7 @@ fn constructor(name: &str, fields: &[Series]) -> PolarsResult { } impl StructChunked { - pub fn from_series(name: &str, fields: &[Series]) -> PolarsResult { + pub fn from_series(name: PlSmallStr, fields: &[Series]) -> PolarsResult { let mut names = PlHashSet::with_capacity(fields.len()); let first_len = fields.first().map(|s| s.len()).unwrap_or(0); let mut max_len = first_len; @@ -110,7 +110,7 @@ impl StructChunked { } constructor(name, &new_fields) } else if fields.is_empty() { - let fields = &[Series::new_null("", 0)]; + let fields = &[Series::new_null(PlSmallStr::const_default(), 0)]; constructor(name, fields) } else { constructor(name, fields) @@ -136,7 +136,11 @@ impl StructChunked { // SAFETY: correct type. unsafe { - Series::from_chunks_and_dtype_unchecked(&field.name, field_chunks, &field.dtype) + Series::from_chunks_and_dtype_unchecked( + field.name.clone(), + field_chunks, + &field.dtype, + ) } }) .collect() @@ -155,7 +159,7 @@ impl StructChunked { let struct_len = self.len(); let new_fields = dtype_fields .iter() - .map(|new_field| match map.get(new_field.name().as_str()) { + .map(|new_field| match map.get(new_field.name()) { Some(s) => { if unchecked { s.cast_unchecked(&new_field.dtype) @@ -164,14 +168,14 @@ impl StructChunked { } }, None => Ok(Series::full_null( - new_field.name(), + new_field.name().clone(), struct_len, &new_field.dtype, )), }) .collect::>>()?; - let mut out = Self::from_series(self.name(), &new_fields)?; + let mut out = Self::from_series(self.name().clone(), &new_fields)?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -213,7 +217,7 @@ impl StructChunked { scratch.clear(); } let array = builder.freeze().boxed(); - Series::try_from((ca.name(), array)) + Series::try_from((ca.name().clone(), array)) }, _ => { let fields = self @@ -227,7 +231,7 @@ impl StructChunked { } }) .collect::>>()?; - 
let mut out = Self::from_series(self.name(), &fields)?; + let mut out = Self::from_series(self.name().clone(), &fields)?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -272,7 +276,7 @@ impl StructChunked { .iter() .map(func) .collect::>>()?; - Self::from_series(self.name(), &fields).map(|mut ca| { + Self::from_series(self.name().clone(), &fields).map(|mut ca| { if self.null_count > 0 { // SAFETY: we don't change types/ lengths. unsafe { @@ -293,7 +297,7 @@ impl StructChunked { pub fn get_row_encoded(&self, options: SortOptions) -> PolarsResult { let s = self.clone().into_series(); _get_rows_encoded_ca( - self.name(), + self.name().clone(), &[s], &[options.descending], &[options.nulls_last], @@ -350,7 +354,7 @@ impl StructChunked { pub fn field_by_name(&self, name: &str) -> PolarsResult { self.fields_as_series() .into_iter() - .find(|s| s.name() == name) + .find(|s| s.name().as_str() == name) .ok_or_else(|| polars_err!(StructFieldNotFound: "{}", name)) } pub(crate) fn set_outer_validity(&mut self, validity: Option) { diff --git a/crates/polars-core/src/chunked_array/temporal/date.rs b/crates/polars-core/src/chunked_array/temporal/date.rs index 26e52d7d2f0f..ea0bb11d10fc 100644 --- a/crates/polars-core/src/chunked_array/temporal/date.rs +++ b/crates/polars-core/src/chunked_array/temporal/date.rs @@ -25,7 +25,7 @@ impl DateChunked { } /// Construct a new [`DateChunked`] from an iterator over [`NaiveDate`]. - pub fn from_naive_date>(name: &str, v: I) -> Self { + pub fn from_naive_date>(name: PlSmallStr, v: I) -> Self { let unit = v.into_iter().map(naive_date_to_date).collect::>(); Int32Chunked::from_vec(name, unit).into() } @@ -51,7 +51,7 @@ impl DateChunked { /// Construct a new [`DateChunked`] from an iterator over optional [`NaiveDate`]. 
pub fn from_naive_date_options>>( - name: &str, + name: PlSmallStr, v: I, ) -> Self { let unit = v.into_iter().map(|opt| opt.map(naive_date_to_date)); diff --git a/crates/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs index 838bc2cd9527..92439e5b7527 100644 --- a/crates/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -72,7 +72,7 @@ impl DatetimeChunked { )? }, }; - ca.rename(self.name()); + ca.rename(self.name().clone()); Ok(ca) } @@ -86,7 +86,7 @@ impl DatetimeChunked { /// Construct a new [`DatetimeChunked`] from an iterator over [`NaiveDateTime`]. pub fn from_naive_datetime>( - name: &str, + name: PlSmallStr, v: I, tu: TimeUnit, ) -> Self { @@ -100,7 +100,7 @@ impl DatetimeChunked { } pub fn from_naive_datetime_options>>( - name: &str, + name: PlSmallStr, v: I, tu: TimeUnit, ) -> Self { @@ -205,7 +205,7 @@ mod test { // NOTE: the values are checked and correct. let dt = DatetimeChunked::from_naive_datetime( - "name", + PlSmallStr::from_static("name"), datetimes.iter().copied(), TimeUnit::Nanoseconds, ); diff --git a/crates/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs index 7c649e3178b0..df8a51388baf 100644 --- a/crates/polars-core/src/chunked_array/temporal/duration.rs +++ b/crates/polars-core/src/chunked_array/temporal/duration.rs @@ -62,7 +62,7 @@ impl DurationChunked { /// Construct a new [`DurationChunked`] from an iterator over [`ChronoDuration`]. pub fn from_duration>( - name: &str, + name: PlSmallStr, v: I, tu: TimeUnit, ) -> Self { @@ -77,7 +77,7 @@ impl DurationChunked { /// Construct a new [`DurationChunked`] from an iterator over optional [`ChronoDuration`]. 
pub fn from_duration_options>>( - name: &str, + name: PlSmallStr, v: I, tu: TimeUnit, ) -> Self { diff --git a/crates/polars-core/src/chunked_array/temporal/mod.rs b/crates/polars-core/src/chunked_array/temporal/mod.rs index d9f50fe9ad96..e3ab1c01c164 100644 --- a/crates/polars-core/src/chunked_array/temporal/mod.rs +++ b/crates/polars-core/src/chunked_array/temporal/mod.rs @@ -17,6 +17,8 @@ use chrono::NaiveTime; use chrono_tz::Tz; #[cfg(feature = "timezones")] use once_cell::sync::Lazy; +#[cfg(feature = "timezones")] +use polars_utils::pl_str::PlSmallStr; #[cfg(all(feature = "regex", feature = "timezones"))] use regex::Regex; #[cfg(feature = "dtype-time")] @@ -68,14 +70,16 @@ pub fn parse_time_zone(tz: &str) -> PolarsResult { /// > In the "Etc" area, zones west of GMT have a positive sign and those east /// > have a negative sign in their name (e.g "Etc/GMT-14" is 14 hours ahead of GMT). #[cfg(feature = "timezones")] -pub fn parse_fixed_offset(tz: &str) -> PolarsResult { +pub fn parse_fixed_offset(tz: &str) -> PolarsResult { + use polars_utils::format_pl_smallstr; + if let Some(caps) = FIXED_OFFSET_RE.captures(tz) { let sign = match caps.name("sign").map(|s| s.as_str()) { Some("-") => "+", _ => "-", }; let hour = caps.name("hour").unwrap().as_str().parse::().unwrap(); - let etc_tz = format!("Etc/GMT{}{}", sign, hour); + let etc_tz = format_pl_smallstr!("Etc/GMT{}{}", sign, hour); if etc_tz.parse::().is_ok() { return Ok(etc_tz); } diff --git a/crates/polars-core/src/chunked_array/temporal/time.rs b/crates/polars-core/src/chunked_array/temporal/time.rs index 3627189052a5..77e204c765de 100644 --- a/crates/polars-core/src/chunked_array/temporal/time.rs +++ b/crates/polars-core/src/chunked_array/temporal/time.rs @@ -40,7 +40,7 @@ impl TimeChunked { mutarr.freeze().boxed() }); - ca.rename(self.name()); + ca.rename(self.name().clone()); ca } @@ -65,7 +65,7 @@ impl TimeChunked { } /// Construct a new [`TimeChunked`] from an iterator over [`NaiveTime`]. 
- pub fn from_naive_time>(name: &str, v: I) -> Self { + pub fn from_naive_time>(name: PlSmallStr, v: I) -> Self { let vals = v .into_iter() .map(|nt| time_to_time64ns(&nt)) @@ -75,7 +75,7 @@ impl TimeChunked { /// Construct a new [`TimeChunked`] from an iterator over optional [`NaiveTime`]. pub fn from_naive_time_options>>( - name: &str, + name: PlSmallStr, v: I, ) -> Self { let vals = v.into_iter().map(|opt| opt.map(|nt| time_to_time64ns(&nt))); diff --git a/crates/polars-core/src/chunked_array/trusted_len.rs b/crates/polars-core/src/chunked_array/trusted_len.rs index 84ff13cb906d..f65e640d1fc5 100644 --- a/crates/polars-core/src/chunked_array/trusted_len.rs +++ b/crates/polars-core/src/chunked_array/trusted_len.rs @@ -168,7 +168,7 @@ where { fn from_iter_trusted_length>(iter: I) -> Self { let arr = BinaryArray::from_iter_values(iter.into_iter()); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } @@ -179,7 +179,7 @@ where fn from_iter_trusted_length>>(iter: I) -> Self { let iter = iter.into_iter(); let arr = BinaryArray::from_iter(iter); - ChunkedArray::with_chunk("", arr) + ChunkedArray::with_chunk(PlSmallStr::const_default(), arr) } } diff --git a/crates/polars-core/src/datatypes/aliases.rs b/crates/polars-core/src/datatypes/aliases.rs index d30915a92104..4787b7fcd229 100644 --- a/crates/polars-core/src/datatypes/aliases.rs +++ b/crates/polars-core/src/datatypes/aliases.rs @@ -21,7 +21,7 @@ pub type IdxType = UInt32Type; #[cfg(feature = "bigidx")] pub type IdxType = UInt64Type; -pub use smartstring::alias::String as SmartString; +pub use polars_utils::pl_str::PlSmallStr; /// This hashmap uses an IdHasher pub type PlIdHashMap = hashbrown::HashMap; diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 4c7eeb093066..060371f97bbe 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -1,7 +1,7 @@ 
#[cfg(feature = "dtype-struct")] use arrow::legacy::trusted_len::TrustedLenPush; use arrow::types::PrimitiveType; -use polars_utils::format_smartstring; +use polars_utils::format_pl_smallstr; #[cfg(feature = "dtype-struct")] use polars_utils::slice::GetSaferUnchecked; #[cfg(feature = "dtype-categorical")] @@ -91,7 +91,7 @@ pub enum AnyValue<'a> { #[cfg(feature = "dtype-struct")] StructOwned(Box<(Vec>, Vec)>), /// An UTF8 encoded string type. - StringOwned(smartstring::alias::String), + StringOwned(PlSmallStr), Binary(&'a [u8]), BinaryOwned(Vec), /// A 128-bit fixed point decimal number with a scale. @@ -325,8 +325,8 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { AnyValue::List(value) }, (AvField::StringOwned, variant) => { - let value: String = variant.newtype_variant()?; - AnyValue::StringOwned(value.into()) + let value: PlSmallStr = variant.newtype_variant()?; + AnyValue::StringOwned(value) }, (AvField::BinaryOwned, variant) => { let value = variant.newtype_variant()?; @@ -343,7 +343,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { impl AnyValue<'static> { pub fn zero(dtype: &DataType) -> Self { match dtype { - DataType::String => AnyValue::StringOwned("".into()), + DataType::String => AnyValue::StringOwned(PlSmallStr::const_default()), DataType::Boolean => AnyValue::Boolean(false), // SAFETY: // Numeric values are static, inform the compiler of this. @@ -448,7 +448,7 @@ impl<'a> AnyValue<'a> { NumCast::from((*v).parse::().ok()?) 
} }, - StringOwned(v) => String(v).extract(), + StringOwned(v) => String(v.as_str()).extract(), _ => None, } } @@ -538,11 +538,11 @@ impl<'a> AnyValue<'a> { // to string (av, DataType::String) => { if av.is_unsigned_integer() { - AnyValue::StringOwned(format_smartstring!("{}", av.extract::()?)) + AnyValue::StringOwned(format_pl_smallstr!("{}", av.extract::()?)) } else if av.is_float() { - AnyValue::StringOwned(format_smartstring!("{}", av.extract::()?)) + AnyValue::StringOwned(format_pl_smallstr!("{}", av.extract::()?)) } else { - AnyValue::StringOwned(format_smartstring!("{}", av.extract::()?)) + AnyValue::StringOwned(format_pl_smallstr!("{}", av.extract::()?)) } }, @@ -844,7 +844,7 @@ impl<'a> AnyValue<'a> { pub fn as_borrowed(&self) -> AnyValue<'_> { match self { AnyValue::BinaryOwned(data) => AnyValue::Binary(data), - AnyValue::StringOwned(data) => AnyValue::String(data), + AnyValue::StringOwned(data) => AnyValue::String(data.as_str()), av => av.clone(), } } @@ -872,7 +872,7 @@ impl<'a> AnyValue<'a> { #[cfg(feature = "dtype-time")] Time(v) => Time(v), List(v) => List(v), - String(v) => StringOwned(v.into()), + String(v) => StringOwned(PlSmallStr::from_str(v)), StringOwned(v) => StringOwned(v), Binary(v) => BinaryOwned(v.to_vec()), BinaryOwned(v) => BinaryOwned(v), @@ -907,7 +907,7 @@ impl<'a> AnyValue<'a> { pub fn get_str(&self) -> Option<&str> { match self { AnyValue::String(s) => Some(s), - AnyValue::StringOwned(s) => Some(s), + AnyValue::StringOwned(s) => Some(s.as_str()), #[cfg(feature = "dtype-categorical")] AnyValue::Categorical(idx, rev, arr) | AnyValue::Enum(idx, rev, arr) => { let s = if arr.is_null() { @@ -951,8 +951,8 @@ impl AnyValue<'_> { (Float32(l), Float32(r)) => l.to_total_ord() == r.to_total_ord(), (Float64(l), Float64(r)) => l.to_total_ord() == r.to_total_ord(), (String(l), String(r)) => l == r, - (String(l), StringOwned(r)) => l == r, - (StringOwned(l), String(r)) => l == r, + (String(l), StringOwned(r)) => *l == r.as_str(), + 
(StringOwned(l), String(r)) => l.as_str() == *r, (StringOwned(l), StringOwned(r)) => l == r, (Boolean(l), Boolean(r)) => *l == *r, (Binary(l), Binary(r)) => l == r, @@ -1302,7 +1302,7 @@ mod test { DataType::Datetime(TimeUnit::Milliseconds, None), ), ( - ArrowDataType::Timestamp(ArrowTimeUnit::Second, Some("".to_string())), + ArrowDataType::Timestamp(ArrowTimeUnit::Second, Some(PlSmallStr::const_default())), DataType::Datetime(TimeUnit::Milliseconds, None), ), (ArrowDataType::LargeUtf8, DataType::String), @@ -1337,7 +1337,7 @@ mod test { (ArrowDataType::Time32(ArrowTimeUnit::Second), DataType::Time), ( ArrowDataType::List(Box::new(ArrowField::new( - "item", + PlSmallStr::from_static("item"), ArrowDataType::Float64, true, ))), @@ -1345,7 +1345,7 @@ mod test { ), ( ArrowDataType::LargeList(Box::new(ArrowField::new( - "item", + PlSmallStr::from_static("item"), ArrowDataType::Float64, true, ))), diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs index 0ae356f4b13e..5b0aed4baa3a 100644 --- a/crates/polars-core/src/datatypes/dtype.rs +++ b/crates/polars-core/src/datatypes/dtype.rs @@ -8,7 +8,7 @@ use super::*; use crate::chunked_array::object::registry::ObjectRegistry; use crate::utils::materialize_dyn_int; -pub type TimeZone = String; +pub type TimeZone = PlSmallStr; pub static DTYPE_ENUM_KEY: &str = "POLARS.CATEGORICAL_TYPE"; pub static DTYPE_ENUM_VALUE: &str = "ENUM"; @@ -152,14 +152,13 @@ impl Eq for DataType {} impl DataType { /// Standardize timezones to consistent values. 
- pub(crate) fn canonical_timezone(tz: &Option) -> Option { + pub(crate) fn canonical_timezone(tz: &Option) -> Option { match tz.as_deref() { - Some("") => None, + Some("") | None => None, #[cfg(feature = "timezones")] - Some("+00:00") | Some("00:00") | Some("utc") => Some("UTC"), - _ => tz.as_deref(), + Some("+00:00") | Some("00:00") | Some("utc") => Some(PlSmallStr::from_static("UTC")), + Some(v) => Some(PlSmallStr::from_str(v)), } - .map(|s| s.to_string()) } pub fn value_within_range(&self, other: AnyValue) -> bool { @@ -262,7 +261,7 @@ impl DataType { Struct(fields) => { let new_fields = fields .iter() - .map(|s| Field::new(s.name(), s.data_type().to_physical())) + .map(|s| Field::new(s.name().clone(), s.data_type().to_physical())) .collect(); Struct(new_fields) }, @@ -502,7 +501,7 @@ impl DataType { } /// Convert to an Arrow Field - pub fn to_arrow_field(&self, name: &str, compat_level: CompatLevel) -> ArrowField { + pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField { let metadata = match self { #[cfg(feature = "dtype-categorical")] DataType::Enum(_, _) => Some(BTreeMap::from([( @@ -510,8 +509,8 @@ impl DataType { DTYPE_ENUM_VALUE.into(), )])), DataType::BinaryOffset => Some(BTreeMap::from([( - "pl".to_string(), - "maintain_type".to_string(), + PlSmallStr::from_static("pl"), + PlSmallStr::from_static("maintain_type"), )])), _ => None, }; @@ -578,11 +577,11 @@ impl DataType { Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)), #[cfg(feature = "dtype-array")] Array(dt, size) => Ok(ArrowDataType::FixedSizeList( - Box::new(dt.to_arrow_field("item", compat_level)), + Box::new(dt.to_arrow_field(PlSmallStr::from_static("item"), compat_level)), *size, )), List(dt) => Ok(ArrowDataType::LargeList(Box::new( - dt.to_arrow_field("item", compat_level), + dt.to_arrow_field(PlSmallStr::from_static("item"), compat_level), ))), Null => Ok(ArrowDataType::Null), #[cfg(feature = "object")] diff --git 
a/crates/polars-core/src/datatypes/field.rs b/crates/polars-core/src/datatypes/field.rs index aea148546ef0..3f0f800ebdbf 100644 --- a/crates/polars-core/src/datatypes/field.rs +++ b/crates/polars-core/src/datatypes/field.rs @@ -1,4 +1,4 @@ -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use super::*; @@ -9,7 +9,7 @@ use super::*; derive(Serialize, Deserialize) )] pub struct Field { - pub name: SmartString, + pub name: PlSmallStr, pub dtype: DataType, } @@ -22,19 +22,12 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let f1 = Field::new("Fruit name", DataType::String); - /// let f2 = Field::new("Lawful", DataType::Boolean); - /// let f2 = Field::new("Departure", DataType::Time); + /// let f1 = Field::new("Fruit name".into(), DataType::String); + /// let f2 = Field::new("Lawful".into(), DataType::Boolean); + /// let f2 = Field::new("Departure".into(), DataType::Time); /// ``` #[inline] - pub fn new(name: &str, dtype: DataType) -> Self { - Field { - name: name.into(), - dtype, - } - } - - pub fn from_owned(name: SmartString, dtype: DataType) -> Self { + pub fn new(name: PlSmallStr, dtype: DataType) -> Self { Field { name, dtype } } @@ -44,12 +37,12 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let f = Field::new("Year", DataType::Int32); + /// let f = Field::new("Year".into(), DataType::Int32); /// /// assert_eq!(f.name(), "Year"); /// ``` #[inline] - pub fn name(&self) -> &SmartString { + pub fn name(&self) -> &PlSmallStr { &self.name } @@ -59,7 +52,7 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let f = Field::new("Birthday", DataType::Date); + /// let f = Field::new("Birthday".into(), DataType::Date); /// /// assert_eq!(f.data_type(), &DataType::Date); /// ``` @@ -74,10 +67,10 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let mut f = Field::new("Temperature", DataType::Int32); + /// let mut f = 
Field::new("Temperature".into(), DataType::Int32); /// f.coerce(DataType::Float32); /// - /// assert_eq!(f, Field::new("Temperature", DataType::Float32)); + /// assert_eq!(f, Field::new("Temperature".into(), DataType::Float32)); /// ``` pub fn coerce(&mut self, dtype: DataType) { self.dtype = dtype; @@ -89,12 +82,12 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let mut f = Field::new("Atomic number", DataType::UInt32); + /// let mut f = Field::new("Atomic number".into(), DataType::UInt32); /// f.set_name("Proton".into()); /// - /// assert_eq!(f, Field::new("Proton", DataType::UInt32)); + /// assert_eq!(f, Field::new("Proton".into(), DataType::UInt32)); /// ``` - pub fn set_name(&mut self, name: SmartString) { + pub fn set_name(&mut self, name: PlSmallStr) { self.name = name; } @@ -104,13 +97,13 @@ impl Field { /// /// ```rust /// # use polars_core::prelude::*; - /// let f = Field::new("Value", DataType::Int64); - /// let af = arrow::datatypes::Field::new("Value", arrow::datatypes::ArrowDataType::Int64, true); + /// let f = Field::new("Value".into(), DataType::Int64); + /// let af = arrow::datatypes::Field::new("Value".into(), arrow::datatypes::ArrowDataType::Int64, true); /// /// assert_eq!(f.to_arrow(CompatLevel::newest()), af); /// ``` pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowField { - self.dtype.to_arrow_field(self.name.as_str(), compat_level) + self.dtype.to_arrow_field(self.name.clone(), compat_level) } } @@ -163,7 +156,7 @@ impl DataType { ArrowDataType::Struct(_) => { panic!("activate the 'dtype-struct' feature to handle struct data types") } - ArrowDataType::Extension(name, _, _) if name == "POLARS_EXTENSION_TYPE" => { + ArrowDataType::Extension(name, _, _) if name.as_str() == "POLARS_EXTENSION_TYPE" => { #[cfg(feature = "object")] { DataType::Object("extension", None) @@ -199,6 +192,6 @@ impl From<&ArrowDataType> for DataType { impl From<&ArrowField> for Field { fn from(f: &ArrowField) -> Self { - 
Field::new(&f.name, f.data_type().into()) + Field::new(f.name.clone(), f.data_type().into()) } } diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 066536ddec15..77513b242e65 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -1190,8 +1190,12 @@ mod test { #[test] fn test_fmt_list() { - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 10, DataType::Int32); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 10, + DataType::Int32, + ); builder.append_opt_slice(Some(&[1, 2, 3, 4, 5, 6])); builder.append_opt_slice(None); let list_long = builder.finish().into_series(); @@ -1266,8 +1270,12 @@ Series: 'a' [list[i32]] format!("{:?}", list_long) ); - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 10, DataType::Int32); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 10, + DataType::Int32, + ); builder.append_opt_slice(Some(&[1])); builder.append_opt_slice(None); let list_short = builder.finish().into_series(); @@ -1308,8 +1316,12 @@ Series: 'a' [list[i32]] format!("{:?}", list_short) ); - let mut builder = - ListPrimitiveChunkedBuilder::::new("a", 10, 10, DataType::Int32); + let mut builder = ListPrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("a"), + 10, + 10, + DataType::Int32, + ); builder.append_opt_slice(Some(&[])); builder.append_opt_slice(None); let list_empty = builder.finish().into_series(); @@ -1329,7 +1341,8 @@ Series: 'a' [list[i32]] #[test] fn test_fmt_temporal() { - let s = Int32Chunked::new("Date", &[Some(1), None, Some(3)]).into_date(); + let s = Int32Chunked::new(PlSmallStr::from_static("Date"), &[Some(1), None, Some(3)]) + .into_date(); assert_eq!( r#"shape: (3,) Series: 'Date' [date] @@ -1341,8 +1354,11 @@ Series: 'Date' [date] format!("{:?}", s.into_series()) ); - let s = Int64Chunked::new("", &[Some(1), None, Some(1_000_000_000_000)]) - 
.into_datetime(TimeUnit::Nanoseconds, None); + let s = Int64Chunked::new( + PlSmallStr::const_default(), + &[Some(1), None, Some(1_000_000_000_000)], + ) + .into_datetime(TimeUnit::Nanoseconds, None); assert_eq!( r#"shape: (3,) Series: '' [datetime[ns]] @@ -1357,7 +1373,7 @@ Series: '' [datetime[ns]] #[test] fn test_fmt_chunkedarray() { - let ca = Int32Chunked::new("Date", &[Some(1), None, Some(3)]); + let ca = Int32Chunked::new(PlSmallStr::from_static("Date"), &[Some(1), None, Some(3)]); assert_eq!( r#"shape: (3,) ChunkedArray: 'Date' [i32] @@ -1368,7 +1384,7 @@ ChunkedArray: 'Date' [i32] ]"#, format!("{:?}", ca) ); - let ca = StringChunked::new("name", &["a", "b"]); + let ca = StringChunked::new(PlSmallStr::from_static("name"), &["a", "b"]); assert_eq!( r#"shape: (2,) ChunkedArray: 'name' [str] diff --git a/crates/polars-core/src/frame/arithmetic.rs b/crates/polars-core/src/frame/arithmetic.rs index 887fedfb2d57..69e2279cd47f 100644 --- a/crates/polars-core/src/frame/arithmetic.rs +++ b/crates/polars-core/src/frame/arithmetic.rs @@ -151,7 +151,7 @@ impl DataFrame { // trick to fill a series with nulls let vals: &[Option] = &[None]; - let s = Series::new(name, vals).cast(dtype)?; + let s = Series::new(name.clone(), vals).cast(dtype)?; cols.push(s.new_from_index(0, max_len)) } } diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index 906b18dcedb7..e63bc4d394e3 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -1,8 +1,8 @@ use arrow::offset::OffsetsBuffer; +use polars_utils::pl_str::PlSmallStr; use rayon::prelude::*; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use smartstring::alias::String as SmartString; use crate::chunked_array::ops::explode::offsets_to_indexes; use crate::prelude::*; @@ -22,10 +22,10 @@ fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)] 
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct UnpivotArgsIR { - pub on: Vec, - pub index: Vec, - pub variable_name: Option, - pub value_name: Option, + pub on: Vec, + pub index: Vec, + pub variable_name: Option, + pub value_name: Option, } impl DataFrame { @@ -39,15 +39,19 @@ impl DataFrame { return Ok(df); } columns.sort_by(|sa, sb| { - self.check_name_to_idx(sa.name()) + self.check_name_to_idx(sa.name().as_str()) .expect("checked above") - .partial_cmp(&self.check_name_to_idx(sb.name()).expect("checked above")) + .partial_cmp( + &self + .check_name_to_idx(sb.name().as_str()) + .expect("checked above"), + ) .expect("cmp usize -> Ordering") }); // first remove all the exploded columns for s in &columns { - df = df.drop(s.name())?; + df = df.drop(s.name().as_str())?; } let exploded_columns = POOL.install(|| { @@ -63,7 +67,7 @@ impl DataFrame { exploded: Series, ) -> PolarsResult<()> { if exploded.len() == df.height() || df.width() == 0 { - let col_idx = original_df.check_name_to_idx(exploded.name())?; + let col_idx = original_df.check_name_to_idx(exploded.name().as_str())?; df.columns.insert(col_idx, exploded); } else { polars_bail!( @@ -98,7 +102,7 @@ impl DataFrame { let (exploded, offsets) = &exploded_columns[0]; let row_idx = offsets_to_indexes(offsets.as_slice(), exploded.len()); - let mut row_idx = IdxCa::from_vec("", row_idx); + let mut row_idx = IdxCa::from_vec(PlSmallStr::const_default(), row_idx); row_idx.set_sorted_flag(IsSorted::Ascending); // SAFETY: @@ -123,13 +127,13 @@ impl DataFrame { /// /// ```ignore /// # use polars_core::prelude::*; - /// let s0 = Series::new("a", &[1i64, 2, 3]); - /// let s1 = Series::new("b", &[1i64, 1, 1]); - /// let s2 = Series::new("c", &[2i64, 2, 2]); + /// let s0 = Series::new("a".into(), &[1i64, 2, 3]); + /// let s1 = Series::new("b".into(), &[1i64, 1, 1]); + /// let s2 = Series::new("c".into(), &[2i64, 2, 2]); /// let list = Series::new("foo", &[s0, s1, s2]); /// - /// let s0 = 
Series::new("B", [1, 2, 3]); - /// let s1 = Series::new("C", [1, 1, 1]); + /// let s0 = Series::new("B".into(), [1, 2, 3]); + /// let s1 = Series::new("C".into(), [1, 1, 1]); /// let df = DataFrame::new(vec![list, s0, s1])?; /// let exploded = df.explode(["foo"])?; /// @@ -179,7 +183,7 @@ impl DataFrame { pub fn explode(&self, columns: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { // We need to sort the column by order of original occurrence. Otherwise the insert by index // below will panic @@ -196,13 +200,13 @@ mod test { #[cfg(feature = "dtype-i8")] #[cfg_attr(miri, ignore)] fn test_explode() { - let s0 = Series::new("a", &[1i8, 2, 3]); - let s1 = Series::new("b", &[1i8, 1, 1]); - let s2 = Series::new("c", &[2i8, 2, 2]); - let list = Series::new("foo", &[s0, s1, s2]); + let s0 = Series::new(PlSmallStr::from_static("a"), &[1i8, 2, 3]); + let s1 = Series::new(PlSmallStr::from_static("b"), &[1i8, 1, 1]); + let s2 = Series::new(PlSmallStr::from_static("c"), &[2i8, 2, 2]); + let list = Series::new(PlSmallStr::from_static("foo"), &[s0, s1, s2]); - let s0 = Series::new("B", [1, 2, 3]); - let s1 = Series::new("C", [1, 1, 1]); + let s0 = Series::new(PlSmallStr::from_static("B"), [1, 2, 3]); + let s1 = Series::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0.clone(), s1.clone()]).unwrap(); let exploded = df.explode(["foo"]).unwrap(); assert_eq!(exploded.shape(), (9, 3)); @@ -217,11 +221,14 @@ mod test { #[test] #[cfg_attr(miri, ignore)] fn test_explode_df_empty_list() -> PolarsResult<()> { - let s0 = Series::new("a", &[1, 2, 3]); - let s1 = Series::new("b", &[1, 1, 1]); - let list = Series::new("foo", &[s0, s1.clone(), s1.clear()]); - let s0 = Series::new("B", [1, 2, 3]); - let s1 = Series::new("C", [1, 1, 1]); + let s0 = Series::new(PlSmallStr::from_static("a"), &[1, 2, 3]); + let s1 = Series::new(PlSmallStr::from_static("b"), &[1, 1, 1]); + let list = Series::new( + PlSmallStr::from_static("foo"), + &[s0, 
s1.clone(), s1.clear()], + ); + let s0 = Series::new(PlSmallStr::from_static("B"), [1, 2, 3]); + let s1 = Series::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0.clone(), s1.clone()])?; let out = df.explode(["foo"])?; @@ -233,7 +240,10 @@ mod test { assert!(out.equals_missing(&expected)); - let list = Series::new("foo", [s0.clone(), s1.clear(), s1.clone()]); + let list = Series::new( + PlSmallStr::from_static("foo"), + [s0.clone(), s1.clear(), s1.clone()], + ); let df = DataFrame::new(vec![list, s0, s1])?; let out = df.explode(["foo"])?; let expected = df![ @@ -249,9 +259,9 @@ mod test { #[test] #[cfg_attr(miri, ignore)] fn test_explode_single_col() -> PolarsResult<()> { - let s0 = Series::new("a", &[1i32, 2, 3]); - let s1 = Series::new("b", &[1i32, 1, 1]); - let list = Series::new("foo", &[s0, s1]); + let s0 = Series::new(PlSmallStr::from_static("a"), &[1i32, 2, 3]); + let s1 = Series::new(PlSmallStr::from_static("b"), &[1i32, 1, 1]); + let list = Series::new(PlSmallStr::from_static("foo"), &[s0, s1]); let df = DataFrame::new(vec![list])?; let out = df.explode(["foo"])?; diff --git a/crates/polars-core/src/frame/from.rs b/crates/polars-core/src/frame/from.rs index 607fab946857..2af97c569942 100644 --- a/crates/polars-core/src/frame/from.rs +++ b/crates/polars-core/src/frame/from.rs @@ -17,7 +17,7 @@ impl TryFrom for DataFrame { // reported data type is correct unsafe { Series::_try_from_arrow_unchecked_with_md( - &fld.name, + fld.name.clone(), vec![arr], fld.data_type(), Some(&fld.metadata), diff --git a/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs index ef57f7bb953c..faa3f72efc9c 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs @@ -86,7 +86,7 @@ where None, ); - let mut ca = ListChunked::with_chunk(self.name(), arr); + let mut ca = 
ListChunked::with_chunk(self.name().clone(), arr); if can_fast_explode { ca.set_fast_explode() } @@ -148,7 +148,7 @@ where Box::new(array), None, ); - let mut ca = ListChunked::with_chunk(self.name(), arr); + let mut ca = ListChunked::with_chunk(self.name().clone(), arr); if can_fast_explode { ca.set_fast_explode() } @@ -162,14 +162,14 @@ impl AggList for NullChunked { unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { match groups { GroupsProxy::Idx(groups) => { - let mut builder = ListNullChunkedBuilder::new(self.name(), groups.len()); + let mut builder = ListNullChunkedBuilder::new(self.name().clone(), groups.len()); for idx in groups.all().iter() { builder.append_with_len(idx.len()); } builder.finish().into_series() }, GroupsProxy::Slice { groups, .. } => { - let mut builder = ListNullChunkedBuilder::new(self.name(), groups.len()); + let mut builder = ListNullChunkedBuilder::new(self.name().clone(), groups.len()); for [_, len] in groups { builder.append_with_len(*len as usize); } @@ -269,7 +269,7 @@ impl AggList for ObjectChunked { extension_array, None, ); - let mut listarr = ListChunked::with_chunk(self.name(), arr); + let mut listarr = ListChunked::with_chunk(self.name().clone(), arr); if can_fast_explode { listarr.set_fast_explode() } @@ -293,8 +293,10 @@ impl AggList for StructChunked { let arr = gathered.chunks()[0].clone(); let dtype = LargeListArray::default_datatype(arr.data_type().clone()); - let mut chunk = - ListChunked::with_chunk(self.name(), LargeListArray::new(dtype, offsets, arr, None)); + let mut chunk = ListChunked::with_chunk( + self.name().clone(), + LargeListArray::new(dtype, offsets, arr, None), + ); chunk.set_dtype(DataType::List(Box::new(self.dtype().clone()))); if can_fast_explode { chunk.set_fast_explode() @@ -322,8 +324,10 @@ where let arr = gathered.chunks()[0].clone(); let dtype = LargeListArray::default_datatype(arr.data_type().clone()); - let mut chunk = - ListChunked::with_chunk(ca.name(), LargeListArray::new(dtype, 
offsets, arr, None)); + let mut chunk = ListChunked::with_chunk( + ca.name().clone(), + LargeListArray::new(dtype, offsets, arr, None), + ); chunk.set_dtype(DataType::List(Box::new(ca.dtype().clone()))); if can_fast_explode { chunk.set_fast_explode() diff --git a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs index 447352a0faaf..747bb3035ccc 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs @@ -73,7 +73,7 @@ impl Series { } }, ) - .collect_ca(""); + .collect_ca(PlSmallStr::const_default()); // SAFETY: groups are always in bounds. s.take_unchecked(&indices) }, @@ -81,7 +81,7 @@ impl Series { let indices = groups .iter() .map(|&[first, len]| if len == 0 { None } else { Some(first) }) - .collect_ca(""); + .collect_ca(PlSmallStr::const_default()); // SAFETY: groups are always in bounds. s.take_unchecked(&indices) }, @@ -175,7 +175,7 @@ impl Series { * (MS_IN_DAY as f64)) .cast(&Datetime(TimeUnit::Milliseconds, None)) .unwrap(), - _ => Series::full_null("", groups.len(), s.dtype()), + _ => Series::full_null(PlSmallStr::const_default(), groups.len(), s.dtype()), } } @@ -227,7 +227,7 @@ impl Series { * (MS_IN_DAY as f64)) .cast(&Datetime(TimeUnit::Milliseconds, None)) .unwrap(), - _ => Series::full_null("", groups.len(), s.dtype()), + _ => Series::full_null(PlSmallStr::const_default(), groups.len(), s.dtype()), } } @@ -262,7 +262,7 @@ impl Series { s } }, - _ => Series::full_null("", groups.len(), s.dtype()), + _ => Series::full_null(PlSmallStr::const_default(), groups.len(), s.dtype()), } } @@ -287,7 +287,7 @@ impl Series { Some(idx[idx.len() - 1]) } }) - .collect_ca(""); + .collect_ca(PlSmallStr::const_default()); s.take_unchecked(&indices) }, GroupsProxy::Slice { groups, .. 
} => { @@ -300,7 +300,7 @@ impl Series { Some(first + len - 1) } }) - .collect_ca(""); + .collect_ca(PlSmallStr::const_default()); s.take_unchecked(&indices) }, }; diff --git a/crates/polars-core/src/frame/group_by/aggregations/mod.rs b/crates/polars-core/src/frame/group_by/aggregations/mod.rs index bd805d4bd332..f30f60089750 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/mod.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/mod.rs @@ -359,7 +359,7 @@ where { let invalid_quantile = !(0.0..=1.0).contains(&quantile); if invalid_quantile { - return Series::full_null(ca.name(), groups.len(), ca.dtype()); + return Series::full_null(ca.name().clone(), groups.len(), ca.dtype()); } match groups { GroupsProxy::Idx(groups) => { diff --git a/crates/polars-core/src/frame/group_by/expr.rs b/crates/polars-core/src/frame/group_by/expr.rs index 160f9e81f8d3..f35a04a5664f 100644 --- a/crates/polars-core/src/frame/group_by/expr.rs +++ b/crates/polars-core/src/frame/group_by/expr.rs @@ -4,5 +4,5 @@ pub trait PhysicalAggExpr { #[allow(clippy::ptr_arg)] fn evaluate(&self, df: &DataFrame, groups: &GroupsProxy) -> PolarsResult; - fn root_name(&self) -> PolarsResult<&str>; + fn root_name(&self) -> PolarsResult<&PlSmallStr>; } diff --git a/crates/polars-core/src/frame/group_by/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs index 47f585006e81..f9eb08e01b18 100644 --- a/crates/polars-core/src/frame/group_by/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -324,7 +324,7 @@ impl IntoGroupsProxy for ListChunked { let ca = if multithreaded { encode_rows_vertical_par_unordered(by).unwrap() } else { - _get_rows_encoded_ca_unordered("", by).unwrap() + _get_rows_encoded_ca_unordered(PlSmallStr::const_default(), by).unwrap() }; ca.group_tuples(multithreaded, sorted) diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 5206240a3261..3675fb9545ce 100644 --- 
a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -2,6 +2,7 @@ use std::fmt::{Debug, Display, Formatter}; use std::hash::Hash; use num_traits::NumCast; +use polars_utils::format_pl_smallstr; use polars_utils::hashing::DirtyHash; use rayon::prelude::*; @@ -113,7 +114,7 @@ impl DataFrame { pub fn group_by(&self, by: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { let selected_keys = self.select_series(by)?; self.group_by_with_series(selected_keys, true, false) @@ -124,7 +125,7 @@ impl DataFrame { pub fn group_by_stable(&self, by: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { let selected_keys = self.select_series(by)?; self.group_by_with_series(selected_keys, true, true) @@ -152,9 +153,9 @@ impl DataFrame { /// let s0 = DateChunked::parse_from_str_slice("date", dates, fmt) /// .into_series(); /// // create temperature series -/// let s1 = Series::new("temp", [20, 10, 7, 9, 1]); +/// let s1 = Series::new("temp".into(), [20, 10, 7, 9, 1]); /// // create rain series -/// let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01]); +/// let s2 = Series::new("rain".into(), [0.2, 0.1, 0.3, 0.1, 0.01]); /// // create a new DataFrame /// let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); /// println!("{:?}", df); @@ -187,7 +188,7 @@ pub struct GroupBy<'df> { // [first idx, [other idx]] groups: GroupsProxy, // columns selected for aggregation - pub(crate) selected_agg: Option>, + pub(crate) selected_agg: Option>, } impl<'df> GroupBy<'df> { @@ -195,7 +196,7 @@ impl<'df> GroupBy<'df> { df: &'df DataFrame, by: Vec, groups: GroupsProxy, - selected_agg: Option>, + selected_agg: Option>, ) -> Self { GroupBy { df, @@ -211,13 +212,8 @@ impl<'df> GroupBy<'df> { /// Note that making a selection with this method is not required. If you /// skip it all columns (except for the keys) will be selected for aggregation. 
#[must_use] - pub fn select, S: AsRef>(mut self, selection: I) -> Self { - self.selected_agg = Some( - selection - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect(), - ); + pub fn select, S: Into>(mut self, selection: I) -> Self { + self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect()); self } @@ -285,7 +281,10 @@ impl<'df> GroupBy<'df> { ); } - let indices = groups.iter().map(|&[first, _len]| first).collect_ca(""); + let indices = groups + .iter() + .map(|&[first, _len]| first) + .collect_ca(PlSmallStr::const_default()); // SAFETY: groups are always in bounds. let mut out = unsafe { s.take_unchecked(&indices) }; // Sliced groups are always in order of discovery. @@ -303,21 +302,24 @@ impl<'df> GroupBy<'df> { } fn prepare_agg(&self) -> PolarsResult<(Vec, Vec)> { - let selection = match &self.selected_agg { - Some(selection) => selection.clone(), + let keys = self.keys(); + + let agg_col = match &self.selected_agg { + Some(selection) => self.df.select_series_impl(selection.as_slice()), None => { let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect(); - self.df - .get_column_names() - .into_iter() + let selection = self + .df + .iter() + .map(|s| s.name()) .filter(|a| !by.contains(a)) - .map(|s| s.to_string()) - .collect() + .cloned() + .collect::>(); + + self.df.select_series_impl(selection.as_slice()) }, - }; + }?; - let keys = self.keys(); - let agg_col = self.df.select_series(selection)?; Ok((keys, agg_col)) } @@ -351,9 +353,9 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Mean); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean); let mut agg = unsafe { agg_col.agg_mean(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -389,9 +391,9 @@ impl<'df> GroupBy<'df> { let (mut cols, agg_cols) = 
self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Sum); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum); let mut agg = unsafe { agg_col.agg_sum(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -426,9 +428,9 @@ impl<'df> GroupBy<'df> { pub fn min(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Min); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min); let mut agg = unsafe { agg_col.agg_min(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -463,9 +465,9 @@ impl<'df> GroupBy<'df> { pub fn max(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Max); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max); let mut agg = unsafe { agg_col.agg_max(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -500,9 +502,9 @@ impl<'df> GroupBy<'df> { pub fn first(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::First); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First); let mut agg = unsafe { agg_col.agg_first(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -537,9 +539,9 @@ impl<'df> GroupBy<'df> { pub fn last(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Last); + let new_name = 
fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last); let mut agg = unsafe { agg_col.agg_last(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -574,9 +576,9 @@ impl<'df> GroupBy<'df> { pub fn n_unique(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::NUnique); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique); let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg.into_series()); } DataFrame::new(cols) @@ -606,10 +608,12 @@ impl<'df> GroupBy<'df> { ); let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = - fmt_group_by_column(agg_col.name(), GroupByMethod::Quantile(quantile, interpol)); + let new_name = fmt_group_by_column( + agg_col.name().as_str(), + GroupByMethod::Quantile(quantile, interpol), + ); let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, interpol) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg.into_series()); } DataFrame::new(cols) @@ -629,9 +633,9 @@ impl<'df> GroupBy<'df> { pub fn median(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Median); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median); let mut agg = unsafe { agg_col.agg_median(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg.into_series()); } DataFrame::new(cols) @@ -642,9 +646,9 @@ impl<'df> GroupBy<'df> { pub fn var(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Var(ddof)); + let new_name = 
fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof)); let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg.into_series()); } DataFrame::new(cols) @@ -655,9 +659,9 @@ impl<'df> GroupBy<'df> { pub fn std(&self, ddof: u8) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Std(ddof)); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof)); let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg.into_series()); } DataFrame::new(cols) @@ -693,13 +697,13 @@ impl<'df> GroupBy<'df> { for agg_col in agg_cols { let new_name = fmt_group_by_column( - agg_col.name(), + agg_col.name().as_str(), GroupByMethod::Count { include_nulls: true, }, ); let mut ca = self.groups.group_count(); - ca.rename(&new_name); + ca.rename(new_name); cols.push(ca.into_series()); } DataFrame::new(cols) @@ -734,7 +738,7 @@ impl<'df> GroupBy<'df> { let mut cols = self.keys(); let mut column = self.groups.as_list_chunked(); let new_name = fmt_group_by_column("", GroupByMethod::Groups); - column.rename(&new_name); + column.rename(new_name); cols.push(column.into_series()); DataFrame::new(cols) } @@ -769,9 +773,9 @@ impl<'df> GroupBy<'df> { pub fn agg_list(&self) -> PolarsResult { let (mut cols, agg_cols) = self.prepare_agg()?; for agg_col in agg_cols { - let new_name = fmt_group_by_column(agg_col.name(), GroupByMethod::Implode); + let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode); let mut agg = unsafe { agg_col.agg_list(&self.groups) }; - agg.rename(&new_name); + agg.rename(new_name); cols.push(agg); } DataFrame::new(cols) @@ -785,7 +789,7 @@ impl<'df> GroupBy<'df> { } else { let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len()); 
new_cols.extend_from_slice(&self.selected_keys); - let cols = self.df.select_series(agg)?; + let cols = self.df.select_series_impl(agg.as_slice())?; new_cols.extend(cols); Ok(unsafe { DataFrame::new_no_checks(new_cols) }) } @@ -893,25 +897,25 @@ impl Display for GroupByMethod { } // Formatting functions used in eager and lazy code for renaming grouped columns -pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> String { +pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr { use GroupByMethod::*; match method { - Min => format!("{name}_min"), - Max => format!("{name}_max"), - NanMin => format!("{name}_nan_min"), - NanMax => format!("{name}_nan_max"), - Median => format!("{name}_median"), - Mean => format!("{name}_mean"), - First => format!("{name}_first"), - Last => format!("{name}_last"), - Sum => format!("{name}_sum"), - Groups => "groups".to_string(), - NUnique => format!("{name}_n_unique"), - Count { .. } => format!("{name}_count"), - Implode => format!("{name}_agg_list"), - Quantile(quantile, _interpol) => format!("{name}_quantile_{quantile:.2}"), - Std(_) => format!("{name}_agg_std"), - Var(_) => format!("{name}_agg_var"), + Min => format_pl_smallstr!("{name}_min"), + Max => format_pl_smallstr!("{name}_max"), + NanMin => format_pl_smallstr!("{name}_nan_min"), + NanMax => format_pl_smallstr!("{name}_nan_max"), + Median => format_pl_smallstr!("{name}_median"), + Mean => format_pl_smallstr!("{name}_mean"), + First => format_pl_smallstr!("{name}_first"), + Last => format_pl_smallstr!("{name}_last"), + Sum => format_pl_smallstr!("{name}_sum"), + Groups => PlSmallStr::from_static("groups"), + NUnique => format_pl_smallstr!("{name}_n_unique"), + Count { .. 
} => format_pl_smallstr!("{name}_count"), + Implode => format_pl_smallstr!("{name}_agg_list"), + Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"), + Std(_) => format_pl_smallstr!("{name}_agg_std"), + Var(_) => format_pl_smallstr!("{name}_agg_var"), } } @@ -926,7 +930,7 @@ mod test { #[cfg_attr(miri, ignore)] fn test_group_by() -> PolarsResult<()> { let s0 = Series::new( - "date", + PlSmallStr::from_static("date"), &[ "2020-08-21", "2020-08-21", @@ -935,14 +939,14 @@ mod test { "2020-08-22", ], ); - let s1 = Series::new("temp", [20, 10, 7, 9, 1]); - let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01]); + let s1 = Series::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]); + let s2 = Series::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let out = df.group_by_stable(["date"])?.select(["temp"]).count()?; assert_eq!( out.column("temp_count")?, - &Series::new("temp_count", [2 as IdxSize, 2, 1]) + &Series::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1]) ); // Use of deprecated mean() for testing purposes @@ -954,7 +958,7 @@ mod test { .mean()?; assert_eq!( out.column("temp_mean")?, - &Series::new("temp_mean", [15.0f64, 4.0, 9.0]) + &Series::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0]) ); // Use of deprecated `mean()` for testing purposes @@ -971,7 +975,7 @@ mod test { let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?; assert_eq!( out.column("temp_sum")?, - &Series::new("temp_sum", [30, 8, 9]) + &Series::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9]) ); // Use of deprecated `n_unique()` for testing purposes @@ -987,19 +991,19 @@ mod test { #[cfg_attr(miri, ignore)] fn test_static_group_by_by_12_columns() { // Build GroupBy DataFrame. 
- let s0 = Series::new("G1", ["A", "A", "B", "B", "C"].as_ref()); - let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref()); - let s2 = Series::new("G2", ["k", "l", "m", "m", "l"].as_ref()); - let s3 = Series::new("G3", ["a", "b", "c", "c", "d"].as_ref()); - let s4 = Series::new("G4", ["1", "2", "3", "3", "4"].as_ref()); - let s5 = Series::new("G5", ["X", "Y", "Z", "Z", "W"].as_ref()); - let s6 = Series::new("G6", [false, true, true, true, false].as_ref()); - let s7 = Series::new("G7", ["r", "x", "q", "q", "o"].as_ref()); - let s8 = Series::new("G8", ["R", "X", "Q", "Q", "O"].as_ref()); - let s9 = Series::new("G9", [1, 2, 3, 3, 4].as_ref()); - let s10 = Series::new("G10", [".", "!", "?", "?", "/"].as_ref()); - let s11 = Series::new("G11", ["(", ")", "@", "@", "$"].as_ref()); - let s12 = Series::new("G12", ["-", "_", ";", ";", ","].as_ref()); + let s0 = Series::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref()); + let s1 = Series::new("N".into(), [1, 2, 2, 4, 2].as_ref()); + let s2 = Series::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref()); + let s3 = Series::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref()); + let s4 = Series::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref()); + let s5 = Series::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref()); + let s6 = Series::new("G6".into(), [false, true, true, true, false].as_ref()); + let s7 = Series::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref()); + let s8 = Series::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref()); + let s9 = Series::new("G9".into(), [1, 2, 3, 3, 4].as_ref()); + let s10 = Series::new("G10".into(), [".", "!", "?", "?", "/"].as_ref()); + let s11 = Series::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref()); + let s12 = Series::new("G12".into(), ["-", "_", ";", ";", ","].as_ref()); let df = DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap(); @@ -1036,13 +1040,13 @@ mod test { let mut series = Vec::with_capacity(14); // Create a series for every group 
name. - for series_name in &series_names { - let group_series = Series::new(series_name, series_content.as_ref()); + for series_name in series_names { + let group_series = Series::new(series_name.into(), series_content.as_ref()); series.push(group_series); } // Create a series for the aggregation column. - let agg_series = Series::new("N", [1, 2, 3, 3, 4].as_ref()); + let agg_series = Series::new("N".into(), [1, 2, 3, 3, 4].as_ref()); series.push(agg_series); // Create the dataframe with the computed series. diff --git a/crates/polars-core/src/frame/group_by/proxy.rs b/crates/polars-core/src/frame/group_by/proxy.rs index 63f37368b756..aa6b327fa547 100644 --- a/crates/polars-core/src/frame/group_by/proxy.rs +++ b/crates/polars-core/src/frame/group_by/proxy.rs @@ -345,7 +345,7 @@ impl GroupsProxy { } unsafe { ( - Some(IdxCa::from_vec("", gather_offsets)), + Some(IdxCa::from_vec(PlSmallStr::const_default(), gather_offsets)), OffsetsBuffer::new_unchecked(list_offset.into()), can_fast_explode, ) @@ -369,7 +369,7 @@ impl GroupsProxy { unsafe { ( - Some(IdxCa::from_vec("", gather_offsets)), + Some(IdxCa::from_vec(PlSmallStr::const_default(), gather_offsets)), OffsetsBuffer::new_unchecked(list_offset.into()), can_fast_explode, ) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 6ed4e8bbb356..bcbf486e0877 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -3,11 +3,11 @@ use polars_utils::aliases::PlHashSet; use crate::datatypes::AnyValue; use crate::frame::DataFrame; -use crate::prelude::{Series, SmartString}; +use crate::prelude::{PlSmallStr, Series}; -fn check_hstack<'a>( - col: &'a Series, - names: &mut PlHashSet<&'a str>, +fn check_hstack( + col: &Series, + names: &mut PlHashSet, height: usize, is_empty: bool, ) -> PolarsResult<()> { @@ -17,8 +17,8 @@ fn check_hstack<'a>( col.len(), height, ); polars_ensure!( - names.insert(col.name()), - Duplicate: "unable 
to hstack, column with name {:?} already exists", col.name(), + names.insert(col.name().clone()), + Duplicate: "unable to hstack, column with name {:?} already exists", col.name().as_str(), ); Ok(()) } @@ -50,7 +50,7 @@ impl DataFrame { let mut names = self .columns .iter() - .map(|c| c.name()) + .map(|c| c.name().clone()) .collect::>(); let height = self.height(); @@ -99,15 +99,12 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars let height = first_df.height(); let is_empty = first_df.is_empty(); - let columns; let mut names = if check_duplicates { - columns = first_df + first_df .columns .iter() - .map(|s| SmartString::from(s.name())) - .collect::>(); - - columns.iter().map(|n| n.as_str()).collect::>() + .map(|s| s.name().clone()) + .collect::>() } else { Default::default() }; diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 1923d370ebc1..d5e0d2bd9d4a 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -27,9 +27,9 @@ mod top_k; mod upstream_traits; use arrow::record_batch::RecordBatch; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use smartstring::alias::String as SmartString; use crate::chunked_array::cast::CastOptions; #[cfg(feature = "row_hash")] @@ -63,7 +63,7 @@ pub enum UniqueKeepStrategy { fn ensure_names_unique(items: &[T], mut get_name: F) -> PolarsResult<()> where - F: FnMut(&T) -> &str, + F: for<'a> FnMut(&'a T) -> &'a str, { // Always unique. 
if items.len() <= 1 { @@ -120,8 +120,8 @@ where /// /// ```rust /// # use polars_core::prelude::*; -/// let s1 = Series::new("Fruit", &["Apple", "Apple", "Pear"]); -/// let s2 = Series::new("Color", &["Red", "Yellow", "Green"]); +/// let s1 = Series::new("Fruit".into(), &["Apple", "Apple", "Pear"]); +/// let s2 = Series::new("Color".into(), &["Red", "Yellow", "Green"]); /// /// let df: PolarsResult = DataFrame::new(vec![s1, s2]); /// ``` @@ -150,8 +150,8 @@ where /// let df = df!("Fruit" => &["Apple", "Apple", "Pear"], /// "Color" => &["Red", "Yellow", "Green"])?; /// -/// assert_eq!(df[0], Series::new("Fruit", &["Apple", "Apple", "Pear"])); -/// assert_eq!(df[1], Series::new("Color", &["Red", "Yellow", "Green"])); +/// assert_eq!(df[0], Series::new("Fruit".into(), &["Apple", "Apple", "Pear"])); +/// assert_eq!(df[1], Series::new("Color".into(), &["Red", "Yellow", "Green"])); /// # Ok::<(), PolarsError>(()) /// ``` /// @@ -162,8 +162,8 @@ where /// let df = df!("Fruit" => &["Apple", "Apple", "Pear"], /// "Color" => &["Red", "Yellow", "Green"])?; /// -/// assert_eq!(df["Fruit"], Series::new("Fruit", &["Apple", "Apple", "Pear"])); -/// assert_eq!(df["Color"], Series::new("Color", &["Red", "Yellow", "Green"])); +/// assert_eq!(df["Fruit"], Series::new("Fruit".into(), &["Apple", "Apple", "Pear"])); +/// assert_eq!(df["Color"], Series::new("Color".into(), &["Red", "Yellow", "Green"])); /// # Ok::<(), PolarsError>(()) /// ``` #[derive(Clone)] @@ -225,7 +225,7 @@ impl DataFrame { fn check_already_present(&self, name: &str) -> PolarsResult<()> { polars_ensure!( - self.columns.iter().all(|s| s.name() != name), + self.columns.iter().all(|s| s.name().as_str() != name), Duplicate: "column with name {:?} is already present in the DataFrame", name ); Ok(()) @@ -246,14 +246,14 @@ impl DataFrame { /// /// ``` /// # use polars_core::prelude::*; - /// let s0 = Series::new("days", [0, 1, 2].as_ref()); - /// let s1 = Series::new("temp", [22.1, 19.9, 7.].as_ref()); + /// let s0 = 
Series::new("days".into(), [0, 1, 2].as_ref()); + /// let s1 = Series::new("temp".into(), [22.1, 19.9, 7.].as_ref()); /// /// let df = DataFrame::new(vec![s0, s1])?; /// # Ok::<(), PolarsError>(()) /// ``` pub fn new(columns: Vec) -> PolarsResult { - ensure_names_unique(&columns, |s| s.name())?; + ensure_names_unique(&columns, |s| s.name().as_str())?; if columns.len() > 1 { let first_len = columns[0].len(); @@ -272,7 +272,7 @@ impl DataFrame { /// Converts a sequence of columns into a DataFrame, broadcasting length-1 /// columns to match the other columns. pub fn new_with_broadcast(columns: Vec) -> PolarsResult { - ensure_names_unique(&columns, |s| s.name())?; + ensure_names_unique(&columns, |s| s.name().as_str())?; unsafe { Self::new_with_broadcast_no_checks(columns) } } @@ -327,7 +327,7 @@ impl DataFrame { pub fn empty_with_schema(schema: &Schema) -> Self { let cols = schema .iter() - .map(|(name, dtype)| Series::new_empty(name, dtype)) + .map(|(name, dtype)| Series::new_empty(name.clone(), dtype)) .collect(); unsafe { DataFrame::new_no_checks(cols) } } @@ -337,7 +337,7 @@ impl DataFrame { let cols = schema .fields .iter() - .map(|fld| Series::new_empty(fld.name.as_str(), &(fld.data_type().into()))) + .map(|fld| Series::new_empty(fld.name.clone(), &(fld.data_type().into()))) .collect(); unsafe { DataFrame::new_no_checks(cols) } } @@ -348,8 +348,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1 = Series::new("Ocean", &["Atlantic", "Indian"]); - /// let s2 = Series::new("Area (km²)", &[106_460_000, 70_560_000]); + /// let s1 = Series::new("Ocean".into(), &["Atlantic", "Indian"]); + /// let s2 = Series::new("Area (km²)".into(), &[106_460_000, 70_560_000]); /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?; /// /// assert_eq!(df.pop(), Some(s2)); @@ -371,7 +371,7 @@ impl DataFrame { /// let df1: DataFrame = df!("Name" => &["James", "Mary", "John", "Patricia"])?; /// assert_eq!(df1.shape(), (4, 1)); /// - /// let df2: 
DataFrame = df1.with_row_index("Id", None)?; + /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?; /// assert_eq!(df2.shape(), (4, 2)); /// println!("{}", df2); /// @@ -396,7 +396,7 @@ impl DataFrame { /// | 3 | Patricia | /// +-----+----------+ /// ``` - pub fn with_row_index(&self, name: &str, offset: Option) -> PolarsResult { + pub fn with_row_index(&self, name: PlSmallStr, offset: Option) -> PolarsResult { let mut columns = Vec::with_capacity(self.columns.len() + 1); let offset = offset.unwrap_or(0); @@ -412,7 +412,7 @@ impl DataFrame { } /// Add a row index column in place. - pub fn with_row_index_mut(&mut self, name: &str, offset: Option) -> &mut Self { + pub fn with_row_index_mut(&mut self, name: PlSmallStr, offset: Option) -> &mut Self { let offset = offset.unwrap_or(0); let mut ca = IdxCa::from_vec( name, @@ -446,7 +446,7 @@ impl DataFrame { /// It is the callers responsibility to uphold the contract of all `Series` /// having an equal length, if not this may panic down the line. 
pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { - ensure_names_unique(&columns, |s| s.name())?; + ensure_names_unique(&columns, |s| s.name().as_str())?; Ok(DataFrame { columns }) } @@ -531,8 +531,8 @@ impl DataFrame { /// let df: DataFrame = df!("Thing" => &["Observable universe", "Human stupidity"], /// "Diameter (m)" => &[8.8e26, f64::INFINITY])?; /// - /// let f1: Field = Field::new("Thing", DataType::String); - /// let f2: Field = Field::new("Diameter (m)", DataType::Float64); + /// let f1: Field = Field::new("Thing".into(), DataType::String); + /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64); /// let sc: Schema = Schema::from_iter(vec![f1, f2]); /// /// assert_eq!(df.schema(), sc); @@ -581,8 +581,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1: Series = Series::new("Name", &["Pythagoras' theorem", "Shannon entropy"]); - /// let s2: Series = Series::new("Formula", &["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]); + /// let s1: Series = Series::new("Name".into(), &["Pythagoras' theorem", "Shannon entropy"]); + /// let s2: Series = Series::new("Formula".into(), &["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]); /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?; /// /// let mut iterator = df.iter(); @@ -606,13 +606,17 @@ impl DataFrame { /// assert_eq!(df.get_column_names(), &["Language", "Designer"]); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn get_column_names(&self) -> Vec<&str> { + pub fn get_column_names(&self) -> Vec<&PlSmallStr> { self.columns.iter().map(|s| s.name()).collect() } - /// Get the [`Vec`] representing the column names. - pub fn get_column_names_owned(&self) -> Vec { - self.columns.iter().map(|s| s.name().into()).collect() + /// Get the [`Vec`] representing the column names. 
+ pub fn get_column_names_owned(&self) -> Vec { + self.columns.iter().map(|s| s.name().clone()).collect() + } + + pub fn get_column_names_str(&self) -> Vec<&str> { + self.columns.iter().map(|s| s.name().as_str()).collect() } /// Set the column names. @@ -626,13 +630,22 @@ impl DataFrame { /// assert_eq!(df.get_column_names(), &["Set"]); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn set_column_names>(&mut self, names: &[S]) -> PolarsResult<()> { + pub fn set_column_names(&mut self, names: I) -> PolarsResult<()> + where + I: IntoIterator, + S: Into, + { + let names = names.into_iter().map(Into::into).collect::>(); + self._set_column_names_impl(names.as_slice()) + } + + fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> { polars_ensure!( names.len() == self.width(), ShapeMismatch: "{} column names provided for a DataFrame of width {}", names.len(), self.width() ); - ensure_names_unique(names, |s| s.as_ref())?; + ensure_names_unique(names, |s| s.as_str())?; let columns = mem::take(&mut self.columns); self.columns = columns @@ -640,7 +653,7 @@ impl DataFrame { .zip(names) .map(|(s, name)| { let mut s = s; - s.rename(name.as_ref()); + s.rename(name.clone()); s }) .collect(); @@ -680,8 +693,8 @@ impl DataFrame { /// let earth: DataFrame = df!("Surface type" => &["Water", "Land"], /// "Fraction" => &[0.708, 0.292])?; /// - /// let f1: Field = Field::new("Surface type", DataType::String); - /// let f2: Field = Field::new("Fraction", DataType::Float64); + /// let f1: Field = Field::new("Surface type".into(), DataType::String); + /// let f2: Field = Field::new("Fraction".into(), DataType::Float64); /// /// assert_eq!(earth.fields(), &[f1, f2]); /// # Ok::<(), PolarsError>(()) @@ -781,8 +794,8 @@ impl DataFrame { /// ```rust /// # use polars_core::prelude::*; /// let df1: DataFrame = df!("Element" => &["Copper", "Silver", "Gold"])?; - /// let s1: Series = Series::new("Proton", &[29, 47, 79]); - /// let s2: Series = Series::new("Electron", &[29, 
47, 79]); + /// let s1: Series = Series::new("Proton".into(), &[29, 47, 79]); + /// let s2: Series = Series::new("Electron".into(), &[29, 47, 79]); /// /// let df2: DataFrame = df1.hstack(&[s1, s2])?; /// assert_eq!(df2.shape(), (3, 3)); @@ -980,7 +993,7 @@ impl DataFrame { /// assert!(s1.is_err()); /// /// let s2: Series = df.drop_in_place("Animal")?; - /// assert_eq!(s2, Series::new("Animal", &["Tiger", "Lion", "Great auk"])); + /// assert_eq!(s2, Series::new("Animal".into(), &["Tiger", "Lion", "Great auk"])); /// # Ok::<(), PolarsError>(()) /// ``` pub fn drop_in_place(&mut self, name: &str) -> PolarsResult { @@ -1016,22 +1029,26 @@ impl DataFrame { /// | Malta | 32.7 | /// +---------+---------------------+ /// ``` - pub fn drop_nulls>(&self, subset: Option<&[S]>) -> PolarsResult { - let selected_series; - - let mut iter = match subset { - Some(cols) => { - selected_series = self.select_series(cols)?; - selected_series.iter() - }, - None => self.columns.iter(), - }; + pub fn drop_nulls(&self, subset: Option<&[S]>) -> PolarsResult + where + for<'a> &'a S: Into, + { + if let Some(v) = subset { + let v = self.select_series(v)?; + self._drop_nulls_impl(v.as_slice()) + } else { + self._drop_nulls_impl(self.columns.as_slice()) + } + } + fn _drop_nulls_impl(&self, subset: &[Series]) -> PolarsResult { // fast path for no nulls in df - if iter.clone().all(|s| !s.has_nulls()) { + if subset.iter().all(|s| !s.has_nulls()) { return Ok(self.clone()); } + let mut iter = subset.iter(); + let mask = iter .next() .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?; @@ -1071,19 +1088,23 @@ impl DataFrame { } /// Drop columns that are in `names`. 
- pub fn drop_many>(&self, names: &[S]) -> Self { - let names: PlHashSet<_> = names.iter().map(|s| s.as_ref()).collect(); + pub fn drop_many(&self, names: I) -> Self + where + I: IntoIterator, + S: Into, + { + let names: PlHashSet = names.into_iter().map(|s| s.into()).collect(); self.drop_many_amortized(&names) } /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet). - pub fn drop_many_amortized(&self, names: &PlHashSet<&str>) -> DataFrame { + pub fn drop_many_amortized(&self, names: &PlHashSet) -> DataFrame { if names.is_empty() { return self.clone(); } let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len())); self.columns.iter().for_each(|s| { - if !names.contains(&s.name()) { + if !names.contains(s.name()) { new_cols.push(s.clone()) } }); @@ -1114,12 +1135,12 @@ impl DataFrame { column: S, ) -> PolarsResult<&mut Self> { let series = column.into_series(); - self.check_already_present(series.name())?; + self.check_already_present(series.name().as_str())?; self.insert_column_no_name_check(index, series) } fn add_column_by_search(&mut self, series: Series) -> PolarsResult<()> { - if let Some(idx) = self.get_column_index(series.name()) { + if let Some(idx) = self.get_column_index(series.name().as_str()) { self.replace_column(idx, series)?; } else { self.columns.push(series); @@ -1174,7 +1195,7 @@ impl DataFrame { fn add_column_by_schema(&mut self, s: Series, schema: &Schema) -> PolarsResult<()> { let name = s.name(); - if let Some((idx, _, _)) = schema.get_full(name) { + if let Some((idx, _, _)) = schema.get_full(name.as_str()) { // schema is incorrect fallback to search if self.columns.get(idx).map(|s| s.name()) != Some(name) { self.add_column_by_search(s)?; @@ -1191,7 +1212,7 @@ impl DataFrame { for (i, s) in columns.into_iter().enumerate() { // we need to branch here // because users can add multiple columns with the same name - if i == 0 || schema.get(s.name()).is_some() { + if i == 0 || 
schema.get(s.name().as_str()).is_some() { self.with_column_and_schema(s, schema)?; } else { self.with_column(s.clone())?; @@ -1265,7 +1286,7 @@ impl DataFrame { /// "Absolute magnitude" => &[4.83, -5.85, 1.42, 11.18])?; /// /// let s1: Option<&Series> = df.select_at_idx(0); - /// let s2: Series = Series::new("Star", &["Sun", "Betelgeuse", "Sirius A", "Sirius B"]); + /// let s2: Series = Series::new("Star".into(), &["Sun", "Betelgeuse", "Sirius A", "Sirius B"]); /// /// assert_eq!(s1, Some(&s2)); /// # Ok::<(), PolarsError>(()) @@ -1362,7 +1383,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn get_column_index(&self, name: &str) -> Option { - self.columns.iter().position(|s| s.name() == name) + self.columns.iter().position(|s| s.name().as_str() == name) } /// Get column index of a [`Series`] by name. @@ -1377,8 +1398,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1: Series = Series::new("Password", &["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]); - /// let s2: Series = Series::new("Robustness", &["Weak", "Strong"]); + /// let s1: Series = Series::new("Password".into(), &["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]); + /// let s2: Series = Series::new("Robustness".into(), &["Weak", "Strong"]); /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?; /// /// assert_eq!(df.column("Password")?, &s1); @@ -1427,21 +1448,18 @@ impl DataFrame { pub fn select(&self, selection: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { - let cols = selection - .into_iter() - .map(|s| SmartString::from(s.as_ref())) - .collect::>(); - self._select_impl(&cols) + let cols = selection.into_iter().map(|s| s.into()).collect::>(); + self._select_impl(cols.as_slice()) } - pub fn _select_impl(&self, cols: &[SmartString]) -> PolarsResult { + pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult { ensure_names_unique(cols, |s| s.as_str())?; self._select_impl_unchecked(cols) } - pub fn _select_impl_unchecked(&self, cols: 
&[SmartString]) -> PolarsResult { + pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult { let selected = self.select_series_impl(cols)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) } @@ -1450,13 +1468,10 @@ impl DataFrame { pub fn select_with_schema(&self, selection: I, schema: &SchemaRef) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { - let cols = selection - .into_iter() - .map(|s| SmartString::from(s.as_ref())) - .collect::>(); - self.select_with_schema_impl(&cols, schema, true) + let cols = selection.into_iter().map(|s| s.into()).collect::>(); + self._select_with_schema_impl(&cols, schema, true) } /// Select with a known schema. This doesn't check for duplicates. @@ -1467,18 +1482,15 @@ impl DataFrame { ) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { - let cols = selection - .into_iter() - .map(|s| SmartString::from(s.as_ref())) - .collect::>(); - self.select_with_schema_impl(&cols, schema, false) + let cols = selection.into_iter().map(|s| s.into()).collect::>(); + self._select_with_schema_impl(&cols, schema, false) } - fn select_with_schema_impl( + pub fn _select_with_schema_impl( &self, - cols: &[SmartString], + cols: &[PlSmallStr], schema: &Schema, check_duplicates: bool, ) -> PolarsResult { @@ -1492,12 +1504,12 @@ impl DataFrame { /// A non generic implementation to reduce compiler bloat. 
fn select_series_impl_with_schema( &self, - cols: &[SmartString], + cols: &[PlSmallStr], schema: &Schema, ) -> PolarsResult> { cols.iter() .map(|name| { - let index = schema.try_get_full(name)?.0; + let index = schema.try_get_full(name.as_str())?.0; Ok(self.columns[index].clone()) }) .collect() @@ -1506,16 +1518,13 @@ impl DataFrame { pub fn select_physical(&self, selection: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { - let cols = selection - .into_iter() - .map(|s| SmartString::from(s.as_ref())) - .collect::>(); + let cols = selection.into_iter().map(|s| s.into()).collect::>(); self.select_physical_impl(&cols) } - fn select_physical_impl(&self, cols: &[SmartString]) -> PolarsResult { + fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult { ensure_names_unique(cols, |s| s.as_str())?; let selected = self.select_series_physical_impl(cols)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) @@ -1536,7 +1545,7 @@ impl DataFrame { /// assert_eq!(df["Hydrogen"], sv[1]); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn select_series(&self, selection: impl IntoVec) -> PolarsResult> { + pub fn select_series(&self, selection: impl IntoVec) -> PolarsResult> { let cols = selection.into_vec(); self.select_series_impl(&cols) } @@ -1545,12 +1554,12 @@ impl DataFrame { self.columns .iter() .enumerate() - .map(|(i, s)| (s.name(), i)) + .map(|(i, s)| (s.name().as_str(), i)) .collect() } /// A non generic implementation to reduce compiler bloat. - fn select_series_physical_impl(&self, cols: &[SmartString]) -> PolarsResult> { + fn select_series_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { let selected = if cols.len() > 1 && self.columns.len() > 10 { let name_to_idx = self._names_to_idx_map(); cols.iter() @@ -1567,7 +1576,10 @@ impl DataFrame { .collect::>>()? 
} else { cols.iter() - .map(|c| self.column(c).map(|s| s.to_physical_repr().into_owned())) + .map(|c| { + self.column(c.as_str()) + .map(|s| s.to_physical_repr().into_owned()) + }) .collect::>>()? }; @@ -1575,7 +1587,7 @@ impl DataFrame { } /// A non generic implementation to reduce compiler bloat. - fn select_series_impl(&self, cols: &[SmartString]) -> PolarsResult> { + fn select_series_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { let selected = if cols.len() > 1 && self.columns.len() > 10 { // we hash, because there are user that having millions of columns. // # https://github.com/pola-rs/polars/issues/1023 @@ -1591,7 +1603,7 @@ impl DataFrame { .collect::>>()? } else { cols.iter() - .map(|c| self.column(c).cloned()) + .map(|c| self.column(c.as_str()).cloned()) .collect::>>()? }; @@ -1636,7 +1648,7 @@ impl DataFrame { /// ``` /// # use polars_core::prelude::*; /// fn example(df: &DataFrame) -> PolarsResult { - /// let idx = IdxCa::new("idx", &[0, 1, 9]); + /// let idx = IdxCa::new("idx".into(), &[0, 1, 9]); /// df.take(&idx) /// } /// ``` @@ -1688,15 +1700,15 @@ impl DataFrame { /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> { /// let original_name = "foo"; /// let new_name = "bar"; - /// df.rename(original_name, new_name) + /// df.rename(original_name, new_name.into()) /// } /// ``` - pub fn rename(&mut self, column: &str, name: &str) -> PolarsResult<&mut Self> { - if column == name { + pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> { + if column == name.as_str() { return Ok(self); } polars_ensure!( - self.columns.iter().all(|c| c.name() != name), + self.columns.iter().all(|c| c.name() != &name), Duplicate: "column rename attempted with already existing name \"{name}\"" ); self.select_mut(column) @@ -1710,7 +1722,7 @@ impl DataFrame { /// See [`DataFrame::sort`] for more instruction. 
pub fn sort_in_place( &mut self, - by: impl IntoVec, + by: impl IntoVec, sort_options: SortMultipleOptions, ) -> PolarsResult<&mut Self> { let by_column = self.select_series(by)?; @@ -1782,7 +1794,7 @@ impl DataFrame { // fast path for a frame with a single series // no need to compute the sort indices and then take by these indices // simply sort and return as frame - if df.width() == 1 && df.check_name_to_idx(s.name()).is_ok() { + if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() { let mut out = s.sort_with(options)?; if let Some((offset, len)) = slice { out = out.slice(offset, len); @@ -1858,7 +1870,7 @@ impl DataFrame { /// Also see [`DataFrame::sort_in_place`]. pub fn sort( &self, - by: impl IntoVec, + by: impl IntoVec, sort_options: SortMultipleOptions, ) -> PolarsResult { let mut df = self.clone(); @@ -1874,7 +1886,7 @@ impl DataFrame { /// # use polars_core::prelude::*; /// let mut df: DataFrame = df!("Country" => &["United States", "China"], /// "Area (km²)" => &[9_833_520, 9_596_961])?; - /// let s: Series = Series::new("Country", &["USA", "PRC"]); + /// let s: Series = Series::new("Country".into(), &["USA", "PRC"]); /// /// assert!(df.replace("Nation", s.clone()).is_err()); /// assert!(df.replace("Country", s).is_ok()); @@ -1889,7 +1901,7 @@ impl DataFrame { /// of the `Series` passed to this method. 
pub fn replace_or_add( &mut self, - column: &str, + column: PlSmallStr, new_col: S, ) -> PolarsResult<&mut Self> { let mut new_col = new_col.into_series(); @@ -1903,8 +1915,8 @@ impl DataFrame { /// /// ```ignored /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo", &["ham", "spam", "egg"]); - /// let s1 = Series::new("ascii", &[70, 79, 79]); + /// let s0 = Series::new("foo".into(), &["ham", "spam", "egg"]); + /// let s1 = Series::new("ascii".into(), &[70, 79, 79]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // Add 32 to get lowercase ascii values @@ -1940,8 +1952,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo", &["ham", "spam", "egg"]); - /// let s1 = Series::new("names", &["Jean", "Claude", "van"]); + /// let s0 = Series::new("foo".into(), &["ham", "spam", "egg"]); + /// let s1 = Series::new("names".into(), &["Jean", "Claude", "van"]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// fn str_to_len(str_val: &Series) -> Series { @@ -1990,8 +2002,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo", &["ham", "spam", "egg"]); - /// let s1 = Series::new("ascii", &[70, 79, 79]); + /// let s0 = Series::new("foo".into(), &["ham", "spam", "egg"]); + /// let s1 = Series::new("ascii".into(), &[70, 79, 79]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // Add 32 to get lowercase ascii values @@ -2026,7 +2038,7 @@ impl DataFrame { idx, width ) })?; - let name = col.name().to_string(); + let name = col.name().clone(); let new_col = f(col).into_series(); match new_col.len() { 1 => { @@ -2046,7 +2058,7 @@ impl DataFrame { // make sure the name remains the same after applying the closure unsafe { let col = self.columns.get_unchecked_mut(idx); - col.rename(&name); + col.rename(name); } Ok(self) } @@ -2060,8 +2072,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo", 
&["ham", "spam", "egg", "bacon", "quack"]); - /// let s1 = Series::new("values", &[1, 2, 3, 4, 5]); + /// let s0 = Series::new("foo".into(), &["ham", "spam", "egg", "bacon", "quack"]); + /// let s1 = Series::new("values".into(), &[1, 2, 3, 4, 5]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// let idx = vec![0, 1, 4]; @@ -2103,14 +2115,14 @@ impl DataFrame { idx, width ) })?; - let name = col.name().to_string(); + let name = col.name().clone(); let _ = mem::replace(col, f(col).map(|s| s.into_series())?); // make sure the name remains the same after applying the closure unsafe { let col = self.columns.get_unchecked_mut(idx); - col.rename(&name); + col.rename(name); } Ok(self) } @@ -2124,8 +2136,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo", &["ham", "spam", "egg", "bacon", "quack"]); - /// let s1 = Series::new("values", &[1, 2, 3, 4, 5]); + /// let s0 = Series::new("foo".into(), &["ham", "spam", "egg", "bacon", "quack"]); + /// let s1 = Series::new("values".into(), &[1, 2, 3, 4, 5]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // create a mask @@ -2653,32 +2665,39 @@ impl DataFrame { keep: UniqueKeepStrategy, slice: Option<(i64, usize)>, ) -> PolarsResult { - self.unique_impl(true, subset, keep, slice) + self.unique_impl( + true, + subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()), + keep, + slice, + ) } /// Unstable distinct. See [`DataFrame::unique_stable`]. 
#[cfg(feature = "algorithm_group_by")] - pub fn unique( + pub fn unique( &self, subset: Option<&[String]>, keep: UniqueKeepStrategy, slice: Option<(i64, usize)>, ) -> PolarsResult { - self.unique_impl(false, subset, keep, slice) + self.unique_impl( + false, + subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()), + keep, + slice, + ) } #[cfg(feature = "algorithm_group_by")] pub fn unique_impl( &self, maintain_order: bool, - subset: Option<&[String]>, + subset: Option>, keep: UniqueKeepStrategy, slice: Option<(i64, usize)>, ) -> PolarsResult { - let names = match &subset { - Some(s) => s.iter().map(|s| &**s).collect(), - None => self.get_column_names(), - }; + let names = subset.unwrap_or_else(|| self.get_column_names_owned()); let mut df = self.clone(); // take on multiple chunks is terrible df.as_single_chunk_par(); @@ -2755,7 +2774,7 @@ impl DataFrame { /// ``` #[cfg(feature = "algorithm_group_by")] pub fn is_unique(&self) -> PolarsResult { - let gb = self.group_by(self.get_column_names())?; + let gb = self.group_by(self.get_column_names_owned())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -2780,7 +2799,7 @@ impl DataFrame { /// ``` #[cfg(feature = "algorithm_group_by")] pub fn is_duplicated(&self) -> PolarsResult { - let gb = self.group_by(self.get_column_names())?; + let gb = self.group_by(self.get_column_names_owned())?; let groups = gb.take_groups(); Ok(is_unique_helper( groups, @@ -2796,7 +2815,7 @@ impl DataFrame { let cols = self .columns .iter() - .map(|s| Series::new(s.name(), &[s.null_count() as IdxSize])) + .map(|s| Series::new(s.name().clone(), &[s.null_count() as IdxSize])) .collect(); unsafe { Self::new_no_checks(cols) } } @@ -2862,7 +2881,7 @@ impl DataFrame { } } } - let mut ca = IdxCa::mmap_slice("", idx); + let mut ca = IdxCa::mmap_slice(PlSmallStr::const_default(), idx); ca.set_sorted_flag(sorted); self.take_unchecked_impl(&ca, allow_threads) } @@ -2871,21 +2890,21 @@ impl DataFrame { #[doc(hidden)] 
pub fn _partition_by_impl( &self, - cols: &[String], + cols: &[PlSmallStr], stable: bool, include_key: bool, ) -> PolarsResult> { let groups = if stable { - self.group_by_stable(cols)?.take_groups() + self.group_by_stable(cols.iter().cloned())?.take_groups() } else { - self.group_by(cols)?.take_groups() + self.group_by(cols.iter().cloned())?.take_groups() }; // drop key columns prior to calculation if requested let df = if include_key { self.clone() } else { - self.drop_many(cols) + self.drop_many(cols.iter().cloned()) }; // don't parallelize this @@ -2916,37 +2935,47 @@ impl DataFrame { /// Split into multiple DataFrames partitioned by groups #[cfg(feature = "partition_by")] - pub fn partition_by( - &self, - cols: impl IntoVec, - include_key: bool, - ) -> PolarsResult> { - let cols = cols.into_vec(); - self._partition_by_impl(&cols, false, include_key) + pub fn partition_by(&self, cols: I, include_key: bool) -> PolarsResult> + where + I: IntoIterator, + S: Into, + { + let cols = cols + .into_iter() + .map(Into::into) + .collect::>(); + self._partition_by_impl(cols.as_slice(), false, include_key) } /// Split into multiple DataFrames partitioned by groups /// Order of the groups are maintained. #[cfg(feature = "partition_by")] - pub fn partition_by_stable( + pub fn partition_by_stable( &self, - cols: impl IntoVec, + cols: I, include_key: bool, - ) -> PolarsResult> { - let cols = cols.into_vec(); - self._partition_by_impl(&cols, true, include_key) + ) -> PolarsResult> + where + I: IntoIterator, + S: Into, + { + let cols = cols + .into_iter() + .map(Into::into) + .collect::>(); + self._partition_by_impl(cols.as_slice(), true, include_key) } /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be /// inserted as columns. 
#[cfg(feature = "dtype-struct")] - pub fn unnest>(&self, cols: I) -> PolarsResult { + pub fn unnest>(&self, cols: I) -> PolarsResult { let cols = cols.into_vec(); self.unnest_impl(cols.into_iter().collect()) } #[cfg(feature = "dtype-struct")] - fn unnest_impl(&self, cols: PlHashSet) -> PolarsResult { + fn unnest_impl(&self, cols: PlHashSet) -> PolarsResult { let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128)); let mut count = 0; for s in &self.columns { @@ -2964,7 +2993,7 @@ impl DataFrame { let schema = self.schema(); for col in cols { let _ = schema - .get(&col) + .get(col.as_str()) .ok_or_else(|| polars_err!(col_not_found = col))?; } } @@ -3063,8 +3092,8 @@ mod test { use super::*; fn create_frame() -> DataFrame { - let s0 = Series::new("days", [0, 1, 2].as_ref()); - let s1 = Series::new("temp", [22.1, 19.9, 7.].as_ref()); + let s0 = Series::new("days".into(), [0, 1, 2].as_ref()); + let s1 = Series::new("temp".into(), [22.1, 19.9, 7.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } @@ -3092,7 +3121,7 @@ mod test { fn test_filter_broadcast_on_string_col() { let col_name = "some_col"; let v = vec!["test".to_string()]; - let s0 = Series::new(col_name, v); + let s0 = Series::new(PlSmallStr::from_str(col_name), v); let mut df = DataFrame::new(vec![s0]).unwrap(); df = df @@ -3104,10 +3133,10 @@ mod test { #[test] #[cfg_attr(miri, ignore)] fn test_filter_broadcast_on_list_col() { - let s1 = Series::new("", &[true, false, true]); + let s1 = Series::new(PlSmallStr::const_default(), &[true, false, true]); let ll: ListChunked = [&s1].iter().copied().collect(); - let mask = BooleanChunked::from_slice("", &[false]); + let mask = BooleanChunked::from_slice(PlSmallStr::const_default(), &[false]); let new = ll.filter(&mask).unwrap(); assert_eq!(new.chunks.len(), 1); @@ -3135,8 +3164,8 @@ mod test { )?; // Create a series with multiple chunks - let mut s = Series::new("foo", 0..2); - let s2 = Series::new("bar", 0..1); + let mut s = 
Series::new("foo".into(), 0..2); + let s2 = Series::new("bar".into(), 0..1); s.append(&s2)?; // Append series to frame @@ -3154,8 +3183,12 @@ mod test { } .unwrap(); // check if column is replaced - assert!(df.with_column(Series::new("foo", &[1, 2, 3])).is_ok()); - assert!(df.with_column(Series::new("bar", &[1, 2, 3])).is_ok()); + assert!(df + .with_column(Series::new("foo".into(), &[1, 2, 3])) + .is_ok()); + assert!(df + .with_column(Series::new("bar".into(), &[1, 2, 3])) + .is_ok()); assert!(df.column("bar").is_ok()) } @@ -3200,9 +3233,9 @@ mod test { #[cfg(feature = "zip_with")] #[cfg_attr(miri, ignore)] fn test_horizontal_agg() { - let a = Series::new("a", &[1, 2, 6]); - let b = Series::new("b", &[Some(1), None, None]); - let c = Series::new("c", &[Some(4), None, Some(3)]); + let a = Series::new("a".into(), &[1, 2, 6]); + let b = Series::new("b".into(), &[Some(1), None, None]); + let c = Series::new("c".into(), &[Some(4), None, Some(3)]); let df = DataFrame::new(vec![a, b, c]).unwrap(); assert_eq!( @@ -3243,7 +3276,7 @@ mod test { )?; // check that the new column is "c" and not "bar". 
- df.replace_or_add("c", Series::new("bar", [1, 2, 3]))?; + df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?; assert_eq!(df.get_column_names(), &["a", "b", "c"]); Ok(()) @@ -3258,13 +3291,13 @@ mod test { // has got columns, but no rows let mut df = base.clear(); - let out = df.with_column(Series::new("c", [1]))?; + let out = df.with_column(Series::new("c".into(), [1]))?; assert_eq!(out.shape(), (0, 3)); assert!(out.iter().all(|s| s.len() == 0)); // no columns base.columns = vec![]; - let out = base.with_column(Series::new("c", [1]))?; + let out = base.with_column(Series::new("c".into(), [1]))?; assert_eq!(out.shape(), (1, 1)); Ok(()) diff --git a/crates/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs index cedf4da1799f..e804e39ffe8a 100644 --- a/crates/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -1,8 +1,8 @@ #[cfg(feature = "dtype-struct")] +use polars_utils::pl_str::PlSmallStr; +#[cfg(feature = "dtype-struct")] use polars_utils::slice::GetSaferUnchecked; use polars_utils::unreachable_unchecked_release; -#[cfg(feature = "dtype-struct")] -use smartstring::alias::String as SmartString; use super::*; use crate::chunked_array::builder::NullChunkedBuilder; @@ -67,7 +67,7 @@ impl<'a> AnyValueBuffer<'a> { (Float32(builder), val) => builder.append_value(val.extract()?), (Float64(builder), val) => builder.append_value(val.extract()?), (String(builder), AnyValue::String(v)) => builder.append_value(v), - (String(builder), AnyValue::StringOwned(v)) => builder.append_value(v), + (String(builder), AnyValue::StringOwned(v)) => builder.append_value(v.as_str()), (String(builder), AnyValue::Null) => builder.append_null(), #[cfg(feature = "dtype-i8")] (Int8(builder), AnyValue::Null) => builder.append_null(), @@ -151,39 +151,39 @@ impl<'a> AnyValueBuffer<'a> { use AnyValueBuffer::*; match self { Boolean(b) => { - let mut new = BooleanChunkedBuilder::new(b.field.name(), 
capacity); + let mut new = BooleanChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Int32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Int64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, UInt32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, UInt64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-date")] Date(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_date().into_series() }, #[cfg(feature = "dtype-datetime")] Datetime(b, tu, tz) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); let tz = if capacity > 0 { tz.clone() @@ -194,62 +194,68 @@ impl<'a> AnyValueBuffer<'a> { }, #[cfg(feature = "dtype-duration")] Duration(b, tu) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_duration(*tu).into_series() }, #[cfg(feature = "dtype-time")] Time(b) => { - let mut new = 
PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_time().into_series() }, Float32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Float64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, String(b) => { - let mut new = StringChunkedBuilder::new(b.field.name(), capacity); + let mut new = StringChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-i8")] Int8(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-i16")] Int16(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-u8")] UInt8(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-u16")] UInt16(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Null(b) => { - let mut new = NullChunkedBuilder::new(b.field.name(), 0); + let mut new = 
NullChunkedBuilder::new(b.field.name().clone(), 0); std::mem::swap(&mut new, b); new.finish().into_series() }, All(dtype, vals) => { - let out = Series::from_any_values_and_dtype("", vals, dtype, false).unwrap(); + let out = Series::from_any_values_and_dtype( + PlSmallStr::const_default(), + vals, + dtype, + false, + ) + .unwrap(); let mut new = Vec::with_capacity(capacity); std::mem::swap(&mut new, vals); out @@ -272,33 +278,79 @@ impl From<(&DataType, usize)> for AnyValueBuffer<'_> { let (dt, len) = a; use DataType::*; match dt { - Boolean => AnyValueBuffer::Boolean(BooleanChunkedBuilder::new("", len)), - Int32 => AnyValueBuffer::Int32(PrimitiveChunkedBuilder::new("", len)), - Int64 => AnyValueBuffer::Int64(PrimitiveChunkedBuilder::new("", len)), - UInt32 => AnyValueBuffer::UInt32(PrimitiveChunkedBuilder::new("", len)), - UInt64 => AnyValueBuffer::UInt64(PrimitiveChunkedBuilder::new("", len)), + Boolean => AnyValueBuffer::Boolean(BooleanChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Int32 => AnyValueBuffer::Int32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Int64 => AnyValueBuffer::Int64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + UInt32 => AnyValueBuffer::UInt32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + UInt64 => AnyValueBuffer::UInt64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-i8")] - Int8 => AnyValueBuffer::Int8(PrimitiveChunkedBuilder::new("", len)), + Int8 => AnyValueBuffer::Int8(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-i16")] - Int16 => AnyValueBuffer::Int16(PrimitiveChunkedBuilder::new("", len)), + Int16 => AnyValueBuffer::Int16(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-u8")] - UInt8 => AnyValueBuffer::UInt8(PrimitiveChunkedBuilder::new("", len)), + UInt8 => 
AnyValueBuffer::UInt8(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-u16")] - UInt16 => AnyValueBuffer::UInt16(PrimitiveChunkedBuilder::new("", len)), + UInt16 => AnyValueBuffer::UInt16(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-date")] - Date => AnyValueBuffer::Date(PrimitiveChunkedBuilder::new("", len)), + Date => AnyValueBuffer::Date(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-datetime")] - Datetime(tu, tz) => { - AnyValueBuffer::Datetime(PrimitiveChunkedBuilder::new("", len), *tu, tz.clone()) - }, + Datetime(tu, tz) => AnyValueBuffer::Datetime( + PrimitiveChunkedBuilder::new(PlSmallStr::const_default(), len), + *tu, + tz.clone(), + ), #[cfg(feature = "dtype-duration")] - Duration(tu) => AnyValueBuffer::Duration(PrimitiveChunkedBuilder::new("", len), *tu), + Duration(tu) => AnyValueBuffer::Duration( + PrimitiveChunkedBuilder::new(PlSmallStr::const_default(), len), + *tu, + ), #[cfg(feature = "dtype-time")] - Time => AnyValueBuffer::Time(PrimitiveChunkedBuilder::new("", len)), - Float32 => AnyValueBuffer::Float32(PrimitiveChunkedBuilder::new("", len)), - Float64 => AnyValueBuffer::Float64(PrimitiveChunkedBuilder::new("", len)), - String => AnyValueBuffer::String(StringChunkedBuilder::new("", len)), - Null => AnyValueBuffer::Null(NullChunkedBuilder::new("", 0)), + Time => AnyValueBuffer::Time(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Float32 => AnyValueBuffer::Float32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Float64 => AnyValueBuffer::Float64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + String => { + AnyValueBuffer::String(StringChunkedBuilder::new(PlSmallStr::const_default(), len)) + }, + Null => AnyValueBuffer::Null(NullChunkedBuilder::new(PlSmallStr::const_default(), 0)), // Struct and List can be recursive 
so use AnyValues for that dt => AnyValueBuffer::All(dt.clone(), Vec::with_capacity(len)), } @@ -326,7 +378,7 @@ pub enum AnyValueBufferTrusted<'a> { String(StringChunkedBuilder), #[cfg(feature = "dtype-struct")] // not the trusted variant! - Struct(Vec<(AnyValueBuffer<'a>, SmartString)>), + Struct(Vec<(AnyValueBuffer<'a>, PlSmallStr)>), Null(NullChunkedBuilder), All(DataType, Vec>), } @@ -471,7 +523,7 @@ impl<'a> AnyValueBufferTrusted<'a> { let AnyValue::StringOwned(v) = val else { unreachable_unchecked_release!() }; - builder.append_value(v) + builder.append_value(v.as_str()) }, #[cfg(feature = "dtype-struct")] Struct(builders) => { @@ -542,66 +594,66 @@ impl<'a> AnyValueBufferTrusted<'a> { use AnyValueBufferTrusted::*; match self { Boolean(b) => { - let mut new = BooleanChunkedBuilder::new(b.field.name(), capacity); + let mut new = BooleanChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Int32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Int64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, UInt32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, UInt64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Float32(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = 
PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, Float64(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, String(b) => { - let mut new = StringChunkedBuilder::new(b.field.name(), capacity); + let mut new = StringChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-i8")] Int8(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-i16")] Int16(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-u8")] UInt8(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, #[cfg(feature = "dtype-u16")] UInt16(b) => { - let mut new = PrimitiveChunkedBuilder::new(b.field.name(), capacity); + let mut new = PrimitiveChunkedBuilder::new(b.field.name().clone(), capacity); std::mem::swap(&mut new, b); new.finish().into_series() }, @@ -611,21 +663,29 @@ impl<'a> AnyValueBufferTrusted<'a> { .iter_mut() .map(|(b, name)| { let mut s = b.reset(capacity); - s.rename(name.as_str()); + s.rename(name.clone()); s }) .collect::>(); - StructChunked::from_series("", &v).unwrap().into_series() + StructChunked::from_series(PlSmallStr::const_default(), &v) + .unwrap() + .into_series() }, Null(b) => { - let mut new = 
NullChunkedBuilder::new(b.field.name(), 0); + let mut new = NullChunkedBuilder::new(b.field.name().clone(), 0); std::mem::swap(&mut new, b); new.finish().into_series() }, All(dtype, vals) => { let mut swap_vals = Vec::with_capacity(capacity); std::mem::swap(vals, &mut swap_vals); - Series::from_any_values_and_dtype("", &swap_vals, dtype, false).unwrap() + Series::from_any_values_and_dtype( + PlSmallStr::const_default(), + &swap_vals, + dtype, + false, + ) + .unwrap() }, } } @@ -640,22 +700,58 @@ impl From<(&DataType, usize)> for AnyValueBufferTrusted<'_> { let (dt, len) = a; use DataType::*; match dt { - Boolean => AnyValueBufferTrusted::Boolean(BooleanChunkedBuilder::new("", len)), - Int32 => AnyValueBufferTrusted::Int32(PrimitiveChunkedBuilder::new("", len)), - Int64 => AnyValueBufferTrusted::Int64(PrimitiveChunkedBuilder::new("", len)), - UInt32 => AnyValueBufferTrusted::UInt32(PrimitiveChunkedBuilder::new("", len)), - UInt64 => AnyValueBufferTrusted::UInt64(PrimitiveChunkedBuilder::new("", len)), + Boolean => AnyValueBufferTrusted::Boolean(BooleanChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Int32 => AnyValueBufferTrusted::Int32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Int64 => AnyValueBufferTrusted::Int64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + UInt32 => AnyValueBufferTrusted::UInt32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + UInt64 => AnyValueBufferTrusted::UInt64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-i8")] - Int8 => AnyValueBufferTrusted::Int8(PrimitiveChunkedBuilder::new("", len)), + Int8 => AnyValueBufferTrusted::Int8(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-i16")] - Int16 => AnyValueBufferTrusted::Int16(PrimitiveChunkedBuilder::new("", len)), + Int16 => AnyValueBufferTrusted::Int16(PrimitiveChunkedBuilder::new( + 
PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-u8")] - UInt8 => AnyValueBufferTrusted::UInt8(PrimitiveChunkedBuilder::new("", len)), + UInt8 => AnyValueBufferTrusted::UInt8(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-u16")] - UInt16 => AnyValueBufferTrusted::UInt16(PrimitiveChunkedBuilder::new("", len)), - Float32 => AnyValueBufferTrusted::Float32(PrimitiveChunkedBuilder::new("", len)), - Float64 => AnyValueBufferTrusted::Float64(PrimitiveChunkedBuilder::new("", len)), - String => AnyValueBufferTrusted::String(StringChunkedBuilder::new("", len)), + UInt16 => AnyValueBufferTrusted::UInt16(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Float32 => AnyValueBufferTrusted::Float32(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + Float64 => AnyValueBufferTrusted::Float64(PrimitiveChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), + String => AnyValueBufferTrusted::String(StringChunkedBuilder::new( + PlSmallStr::const_default(), + len, + )), #[cfg(feature = "dtype-struct")] Struct(fields) => { let buffers = fields diff --git a/crates/polars-core/src/frame/row/dataframe.rs b/crates/polars-core/src/frame/row/dataframe.rs index f9e60cebcd0e..a2fe1041abb5 100644 --- a/crates/polars-core/src/frame/row/dataframe.rs +++ b/crates/polars-core/src/frame/row/dataframe.rs @@ -79,9 +79,9 @@ impl DataFrame { // if the schema adds a column not in the rows, we // fill it with nulls if s.is_empty() { - Series::full_null(name, expected_len, s.dtype()) + Series::full_null(name.clone(), expected_len, s.dtype()) } else { - s.rename(name); + s.rename(name.clone()); s } }) @@ -121,9 +121,9 @@ impl DataFrame { // if the schema adds a column not in the rows, we // fill it with nulls if s.is_empty() { - Series::full_null(name, expected_len, s.dtype()) + Series::full_null(name.clone(), expected_len, s.dtype()) } else { - s.rename(name); + 
s.rename(name.clone()); s } }) diff --git a/crates/polars-core/src/frame/row/mod.rs b/crates/polars-core/src/frame/row/mod.rs index e9cf92ffad13..fc565907c00a 100644 --- a/crates/polars-core/src/frame/row/mod.rs +++ b/crates/polars-core/src/frame/row/mod.rs @@ -10,6 +10,7 @@ use std::hint::unreachable_unchecked; use arrow::bitmap::Bitmap; pub use av_buffer::*; +use polars_utils::format_pl_smallstr; #[cfg(feature = "object")] use polars_utils::total_ord::TotalHash; use rayon::prelude::*; @@ -96,10 +97,10 @@ impl<'a> Row<'a> { } } -type Tracker = PlIndexMap>; +type Tracker = PlIndexMap>; pub fn infer_schema( - iter: impl Iterator)>>, + iter: impl Iterator, impl Into)>>, infer_schema_length: usize, ) -> Schema { let mut values: Tracker = Tracker::default(); @@ -108,25 +109,25 @@ pub fn infer_schema( let max_infer = std::cmp::min(len, infer_schema_length); for inner in iter.take(max_infer) { for (key, value) in inner { - add_or_insert(&mut values, &key, value.into()); + add_or_insert(&mut values, key.into(), value.into()); } } Schema::from_iter(resolve_fields(values)) } -fn add_or_insert(values: &mut Tracker, key: &str, data_type: DataType) { +fn add_or_insert(values: &mut Tracker, key: PlSmallStr, data_type: DataType) { if data_type == DataType::Null { return; } - if values.contains_key(key) { - let x = values.get_mut(key).unwrap(); + if values.contains_key(&key) { + let x = values.get_mut(&key).unwrap(); x.insert(data_type); } else { // create hashset and add value type let mut hs = PlHashSet::new(); hs.insert(data_type); - values.insert(key.to_string(), hs); + values.insert(key, hs); } } @@ -134,7 +135,7 @@ fn resolve_fields(spec: Tracker) -> Vec { spec.iter() .map(|(k, hs)| { let v: Vec<&DataType> = hs.iter().collect(); - Field::new(k, coerce_data_type(&v)) + Field::new(k.clone(), coerce_data_type(&v)) }) .collect() } @@ -237,7 +238,7 @@ pub fn rows_to_schema_first_non_null( impl<'a> From<&AnyValue<'a>> for Field { fn from(val: &AnyValue<'a>) -> Self { - 
Field::new("", val.into()) + Field::new(PlSmallStr::const_default(), val.into()) } } @@ -248,7 +249,7 @@ impl From<&Row<'_>> for Schema { .enumerate() .map(|(i, av)| { let dtype = av.into(); - Field::new(format!("column_{i}").as_ref(), dtype) + Field::new(format_pl_smallstr!("column_{i}"), dtype) }) .collect() } diff --git a/crates/polars-core/src/frame/row/transpose.rs b/crates/polars-core/src/frame/row/transpose.rs index 7ad4bc4f1fef..1984a085116f 100644 --- a/crates/polars-core/src/frame/row/transpose.rs +++ b/crates/polars-core/src/frame/row/transpose.rs @@ -8,8 +8,8 @@ impl DataFrame { pub(crate) fn transpose_from_dtype( &self, dtype: &DataType, - keep_names_as: Option<&str>, - names_out: &[String], + keep_names_as: Option, + names_out: &[PlSmallStr], ) -> PolarsResult { let new_width = self.height(); let new_height = self.width(); @@ -18,7 +18,13 @@ impl DataFrame { None => Vec::::with_capacity(new_width), Some(name) => { let mut tmp = Vec::::with_capacity(new_width + 1); - tmp.push(StringChunked::new(name, self.get_column_names()).into()); + tmp.push( + StringChunked::from_iter_values( + name, + self.get_column_names_owned().into_iter(), + ) + .into(), + ); tmp }, }; @@ -74,7 +80,7 @@ impl DataFrame { cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| { // SAFETY: we are casting back to the supertype let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() }; - s.rename(name); + s.rename(name.clone()); s })); }, @@ -82,26 +88,43 @@ impl DataFrame { Ok(unsafe { DataFrame::new_no_checks(cols_t) }) } - /// Transpose a DataFrame. This is a very expensive operation. 
pub fn transpose( &mut self, keep_names_as: Option<&str>, new_col_names: Option>>, + ) -> PolarsResult { + let new_col_names = match new_col_names { + None => None, + Some(Either::Left(v)) => Some(Either::Left(v.into())), + Some(Either::Right(v)) => Some(Either::Right( + v.into_iter().map(Into::into).collect::>(), + )), + }; + + self.transpose_impl(keep_names_as, new_col_names) + } + /// Transpose a DataFrame. This is a very expensive operation. + pub fn transpose_impl( + &mut self, + keep_names_as: Option<&str>, + new_col_names: Option>>, ) -> PolarsResult { // We must iterate columns as [`AnyValue`], so we must be contiguous. self.as_single_chunk_par(); let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column let names_out = match new_col_names { - None => (0..self.height()).map(|i| format!("column_{i}")).collect(), + None => (0..self.height()) + .map(|i| format_pl_smallstr!("column_{i}")) + .collect(), Some(cn) => match cn { Either::Left(name) => { - let new_names = self.column(&name).and_then(|x| x.str())?; + let new_names = self.column(name.as_str()).and_then(|x| x.str())?; polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values"); - df = Cow::Owned(self.drop(&name)?); + df = Cow::Owned(self.drop(name.as_str())?); new_names .into_no_null_iter() - .map(|s| s.to_owned()) + .map(PlSmallStr::from_str) .collect() }, Either::Right(names) => { @@ -141,7 +164,7 @@ impl DataFrame { }, _ => {}, } - df.transpose_from_dtype(&dtype, keep_names_as, &names_out) + df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out) } } @@ -159,8 +182,11 @@ unsafe fn add_value( // This just fills a pre-allocated mutable series vector, which may have a name column. // Nothing is returned and the actual DataFrame is constructed above. 
-pub(super) fn numeric_transpose(cols: &[Series], names_out: &[String], cols_t: &mut Vec) -where +pub(super) fn numeric_transpose( + cols: &[Series], + names_out: &[PlSmallStr], + cols_t: &mut Vec, +) where T: PolarsNumericType, //S: AsRef, ChunkedArray: IntoSeries, @@ -251,7 +277,7 @@ where values.into(), validity, ); - ChunkedArray::with_chunk(name.as_str(), arr).into_series() + ChunkedArray::with_chunk(name.clone(), arr).into_series() }); POOL.install(|| cols_t.par_extend(par_iter)); } diff --git a/crates/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs index 6ca5548f000f..57cbee3a01dc 100644 --- a/crates/polars-core/src/functions.rs +++ b/crates/polars-core/src/functions.rs @@ -19,8 +19,8 @@ pub fn concat_df_diagonal(dfs: &[DataFrame]) -> PolarsResult { for df in dfs { df.get_columns().iter().for_each(|s| { - let name = s.name(); - if column_names.insert(name) { + let name = s.name().clone(); + if column_names.insert(name.clone()) { schema.push((name, s.dtype())) } }); @@ -33,9 +33,9 @@ pub fn concat_df_diagonal(dfs: &[DataFrame]) -> PolarsResult { let mut columns = Vec::with_capacity(schema.len()); for (name, dtype) in &schema { - match df.column(name).ok() { + match df.column(name.as_str()).ok() { Some(s) => columns.push(s.clone()), - None => columns.push(Series::full_null(name, height, dtype)), + None => columns.push(Series::full_null(name.clone(), height, dtype)), } } unsafe { DataFrame::new_no_checks(columns) } diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index ae37832f8619..34d8f085c5f1 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -451,7 +451,7 @@ pub fn _df_rows_to_hashes_threaded_vertical( let hb = hasher_builder.clone(); let mut hashes = vec![]; series_to_hashes(df.get_columns(), Some(hb), &mut hashes)?; - Ok(UInt64Chunked::from_vec("", hashes)) + 
Ok(UInt64Chunked::from_vec(PlSmallStr::const_default(), hashes)) }) .collect::>>() })?; diff --git a/crates/polars-core/src/named_from.rs b/crates/polars-core/src/named_from.rs index 8bcc17cef853..4d5714e4e517 100644 --- a/crates/polars-core/src/named_from.rs +++ b/crates/polars-core/src/named_from.rs @@ -14,18 +14,18 @@ use crate::prelude::*; pub trait NamedFrom { /// Initialize by name and values. - fn new(name: &str, _: T) -> Self; + fn new(name: PlSmallStr, _: T) -> Self; } pub trait NamedFromOwned { /// Initialize by name and values. - fn from_vec(name: &str, _: T) -> Self; + fn from_vec(name: PlSmallStr, _: T) -> Self; } macro_rules! impl_named_from_owned { ($type:ty, $polars_type:ident) => { impl NamedFromOwned<$type> for Series { - fn from_vec(name: &str, v: $type) -> Self { + fn from_vec(name: PlSmallStr, v: $type) -> Self { ChunkedArray::<$polars_type>::from_vec(name, v).into_series() } } @@ -52,12 +52,12 @@ impl_named_from_owned!(Vec, Float64Type); macro_rules! impl_named_from { ($type:ty, $polars_type:ident, $method:ident) => { impl> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { ChunkedArray::<$polars_type>::$method(name, v.as_ref()).into_series() } } impl> NamedFrom for ChunkedArray<$polars_type> { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { ChunkedArray::<$polars_type>::$method(name, v.as_ref()) } } @@ -106,14 +106,14 @@ impl_named_from!([Option], Float64Type, from_slice_options); macro_rules! 
impl_named_from_range { ($range:ty, $polars_type:ident) => { impl NamedFrom<$range, $polars_type> for ChunkedArray<$polars_type> { - fn new(name: &str, range: $range) -> Self { + fn new(name: PlSmallStr, range: $range) -> Self { let values = range.collect::>(); ChunkedArray::<$polars_type>::from_vec(name, values) } } impl NamedFrom<$range, $polars_type> for Series { - fn new(name: &str, range: $range) -> Self { + fn new(name: PlSmallStr, range: $range) -> Self { ChunkedArray::new(name, range).into_series() } } @@ -125,7 +125,7 @@ impl_named_from_range!(std::ops::Range, UInt64Type); impl_named_from_range!(std::ops::Range, UInt32Type); impl> NamedFrom for Series { - fn new(name: &str, s: T) -> Self { + fn new(name: PlSmallStr, s: T) -> Self { let series_slice = s.as_ref(); let list_cap = series_slice.len(); @@ -155,7 +155,7 @@ impl> NamedFrom for Series { } impl]>> NamedFrom]> for Series { - fn new(name: &str, s: T) -> Self { + fn new(name: PlSmallStr, s: T) -> Self { let series_slice = s.as_ref(); let values_cap = series_slice.iter().fold(0, |acc, opt_s| { acc + opt_s.as_ref().map(|s| s.len()).unwrap_or(0) @@ -173,13 +173,13 @@ impl]>> NamedFrom]> for Series { } } impl<'a, T: AsRef<[&'a str]>> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_slice(name, v.as_ref()).into_series() } } impl NamedFrom<&Series, str> for Series { - fn new(name: &str, s: &Series) -> Self { + fn new(name: PlSmallStr, s: &Series) -> Self { let mut s = s.clone(); s.rename(name); s @@ -187,44 +187,44 @@ impl NamedFrom<&Series, str> for Series { } impl<'a, T: AsRef<[&'a str]>> NamedFrom for StringChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_slice(name, v.as_ref()) } } impl<'a, T: AsRef<[Option<&'a str>]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_slice_options(name, 
v.as_ref()).into_series() } } impl<'a, T: AsRef<[Option<&'a str>]>> NamedFrom]> for StringChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_slice_options(name, v.as_ref()) } } impl<'a, T: AsRef<[Cow<'a, str>]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) .into_series() } } impl<'a, T: AsRef<[Cow<'a, str>]>> NamedFrom]> for StringChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) } } impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::new(name, v).into_series() } } impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for StringChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { StringChunked::from_iter_options( name, v.as_ref() @@ -235,44 +235,44 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>]> } impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_slice(name, v.as_ref()).into_series() } } impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for BinaryChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_slice(name, v.as_ref()) } } impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_slice_options(name, v.as_ref()).into_series() } } impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for BinaryChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_slice_options(name, v.as_ref()) } } impl<'a, T: AsRef<[Cow<'a, 
[u8]>]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) .into_series() } } impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for BinaryChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) } } impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::new(name, v).into_series() } } @@ -280,7 +280,7 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>] impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for BinaryChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { BinaryChunked::from_iter_options( name, v.as_ref() @@ -292,35 +292,35 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>] #[cfg(feature = "dtype-date")] impl> NamedFrom for DateChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DateChunked::from_naive_date(name, v.as_ref().iter().copied()) } } #[cfg(feature = "dtype-date")] impl> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DateChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-date")] impl]>> NamedFrom]> for DateChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DateChunked::from_naive_date_options(name, v.as_ref().iter().copied()) } } #[cfg(feature = "dtype-date")] impl]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DateChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-datetime")] impl> NamedFrom for DatetimeChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DatetimeChunked::from_naive_datetime( name, v.as_ref().iter().copied(), @@ 
-331,14 +331,14 @@ impl> NamedFrom for DatetimeChunke #[cfg(feature = "dtype-datetime")] impl> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DatetimeChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-datetime")] impl]>> NamedFrom]> for DatetimeChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DatetimeChunked::from_naive_datetime_options( name, v.as_ref().iter().copied(), @@ -349,21 +349,21 @@ impl]>> NamedFrom]> fo #[cfg(feature = "dtype-datetime")] impl]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DatetimeChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-duration")] impl> NamedFrom for DurationChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DurationChunked::from_duration(name, v.as_ref().iter().copied(), TimeUnit::Nanoseconds) } } #[cfg(feature = "dtype-duration")] impl> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DurationChunked::new(name, v).into_series() } } @@ -372,7 +372,7 @@ impl> NamedFrom for Series { impl]>> NamedFrom]> for DurationChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DurationChunked::from_duration_options( name, v.as_ref().iter().copied(), @@ -383,49 +383,49 @@ impl]>> NamedFrom]> #[cfg(feature = "dtype-duration")] impl]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { DurationChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-time")] impl> NamedFrom for TimeChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { TimeChunked::from_naive_time(name, v.as_ref().iter().copied()) } } #[cfg(feature = "dtype-time")] impl> NamedFrom for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { 
TimeChunked::new(name, v).into_series() } } #[cfg(feature = "dtype-time")] impl]>> NamedFrom]> for TimeChunked { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { TimeChunked::from_naive_time_options(name, v.as_ref().iter().copied()) } } #[cfg(feature = "dtype-time")] impl]>> NamedFrom]> for Series { - fn new(name: &str, v: T) -> Self { + fn new(name: PlSmallStr, v: T) -> Self { TimeChunked::new(name, v).into_series() } } #[cfg(feature = "object")] impl NamedFrom<&[T], &[T]> for ObjectChunked { - fn new(name: &str, v: &[T]) -> Self { + fn new(name: PlSmallStr, v: &[T]) -> Self { ObjectChunked::from_slice(name, v) } } #[cfg(feature = "object")] impl]>> NamedFrom]> for ObjectChunked { - fn new(name: &str, v: S) -> Self { + fn new(name: PlSmallStr, v: S) -> Self { ObjectChunked::from_slice_options(name, v.as_ref()) } } @@ -433,14 +433,14 @@ impl]>> NamedFrom]> for Objec impl ChunkedArray { /// Specialization that prevents an allocation /// prefer this over ChunkedArray::new when you have a `Vec` and no null values. 
- pub fn new_vec(name: &str, v: Vec) -> Self { + pub fn new_vec(name: PlSmallStr, v: Vec) -> Self { ChunkedArray::from_vec(name, v) } } /// For any [`ChunkedArray`] and [`Series`] impl NamedFrom for Series { - fn new(name: &str, t: T) -> Self { + fn new(name: PlSmallStr, t: T) -> Self { let mut s = t.into_series(); s.rename(name); s @@ -474,9 +474,9 @@ mod test { #[test] fn build_series_from_empty_series_vec() { - let empty_series = Series::new("test", Vec::::new()); + let empty_series = Series::new("test".into(), Vec::::new()); assert_eq!(empty_series.len(), 0); assert_eq!(*empty_series.dtype(), DataType::Null); - assert_eq!(empty_series.name(), "test"); + assert_eq!(empty_series.name().as_str(), "test"); } } diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index 479809b5feef..996c9b83c5c5 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -7,6 +7,7 @@ pub use arrow::datatypes::{ArrowSchema, Field as ArrowField}; pub use arrow::legacy::prelude::*; pub(crate) use arrow::trusted_len::TrustedLen; pub use polars_utils::index::{ChunkId, IdxSize, NullableChunkId, NullableIdxSize}; +pub use polars_utils::pl_str::PlSmallStr; pub(crate) use polars_utils::total_ord::{TotalEq, TotalOrd}; pub use crate::chunked_array::arithmetic::ArithmeticChunked; diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 07ed78b0863f..622b614dfb07 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -1,5 +1,7 @@ pub mod reduce; +use polars_utils::pl_str::PlSmallStr; + use crate::datatypes::{AnyValue, DataType}; use crate::prelude::Series; @@ -24,7 +26,7 @@ impl Scalar { .unwrap_or_else(|| self.value.clone()) } - pub fn into_series(self, name: &str) -> Series { + pub fn into_series(self, name: PlSmallStr) -> Series { Series::from_any_values_and_dtype(name, &[self.as_any_value()], &self.dtype, true).unwrap() } diff --git 
a/crates/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs index 8f04d1bb20be..9354d951d05d 100644 --- a/crates/polars-core/src/schema.rs +++ b/crates/polars-core/src/schema.rs @@ -6,18 +6,18 @@ use indexmap::map::MutableKeys; use indexmap::IndexMap; use polars_utils::aliases::PlRandomState; use polars_utils::itertools::Itertools; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde-lazy")] use serde::{Deserialize, Serialize}; -use smartstring::alias::String as SmartString; use crate::prelude::*; use crate::utils::try_get_supertype; -/// A map from field/column name ([`String`](smartstring::alias::String)) to the type of that field/column ([`DataType`]) +/// A map from field/column name to the type of that field/column ([`DataType`]) #[derive(Eq, Clone, Default)] #[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))] pub struct Schema { - inner: PlIndexMap, + inner: PlIndexMap, } impl Hash for Schema { @@ -103,7 +103,7 @@ impl Schema { /// /// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema /// is updated and `Some(old_name)` is returned. 
- pub fn rename(&mut self, old: &str, new: SmartString) -> Option { + pub fn rename(&mut self, old: &str, new: PlSmallStr) -> Option { // Remove `old`, get the corresponding index and dtype, and move the last item in the map to that position let (old_index, old_name, dtype) = self.inner.swap_remove_full(old)?; // Insert the same dtype under the new name at the end of the map and store that index @@ -130,7 +130,7 @@ impl Schema { pub fn new_inserting_at_index( &self, index: usize, - name: SmartString, + name: PlSmallStr, dtype: DataType, ) -> PolarsResult { polars_ensure!( @@ -168,7 +168,7 @@ impl Schema { pub fn insert_at_index( &mut self, mut index: usize, - name: SmartString, + name: PlSmallStr, dtype: DataType, ) -> PolarsResult> { polars_ensure!( @@ -211,14 +211,14 @@ impl Schema { /// Return all data about the field named `name`: its index in the schema, its name, and its dtype. /// /// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't. - pub fn get_full(&self, name: &str) -> Option<(usize, &SmartString, &DataType)> { + pub fn get_full(&self, name: &str) -> Option<(usize, &PlSmallStr, &DataType)> { self.inner.get_full(name) } /// Return all data about the field named `name`: its index in the schema, its name, and its dtype. /// /// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't. - pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &SmartString, &DataType)> { + pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &PlSmallStr, &DataType)> { self.inner .get_full(name) .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name)) @@ -232,8 +232,8 @@ impl Schema { /// [`get`][Self::get] or [`get_full`][Self::get_full]. 
pub fn get_field(&self, name: &str) -> Option { self.inner - .get(name) - .map(|dtype| Field::new(name, dtype.clone())) + .get_full(name) + .map(|(_, name, dtype)| Field::new(name.clone(), dtype.clone())) } /// Look up the name in the schema and return an owned [`Field`] by cloning the data. @@ -244,20 +244,20 @@ impl Schema { /// [`get`][Self::get] or [`get_full`][Self::get_full]. pub fn try_get_field(&self, name: &str) -> PolarsResult { self.inner - .get(name) + .get_full(name) .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name)) - .map(|dtype| Field::new(name, dtype.clone())) + .map(|(_, name, dtype)| Field::new(name.clone(), dtype.clone())) } /// Get references to the name and dtype of the field at `index`. /// /// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See /// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version. - pub fn get_at_index(&self, index: usize) -> Option<(&SmartString, &DataType)> { + pub fn get_at_index(&self, index: usize) -> Option<(&PlSmallStr, &DataType)> { self.inner.get_index(index) } - pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&SmartString, &DataType)> { + pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&PlSmallStr, &DataType)> { self.inner.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len())) } @@ -265,7 +265,7 @@ impl Schema { /// /// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See /// [`get_at_index`][Self::get_at_index] for an immutable version. - pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut SmartString, &mut DataType)> { + pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut PlSmallStr, &mut DataType)> { self.inner.get_index_mut2(index) } @@ -296,7 +296,7 @@ impl Schema { /// /// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. 
For a /// faster, but not order-preserving, method, use [`remove`][Self::remove]. - pub fn shift_remove_index(&mut self, index: usize) -> Option<(SmartString, DataType)> { + pub fn shift_remove_index(&mut self, index: usize) -> Option<(PlSmallStr, DataType)> { self.inner.shift_remove_index(index) } @@ -341,7 +341,7 @@ impl Schema { /// To enforce the index of the resulting field, use [`insert_at_index`][Self::insert_at_index]. /// /// Computes in **O(1)** time (amortized average). - pub fn with_column(&mut self, name: SmartString, dtype: DataType) -> Option { + pub fn with_column(&mut self, name: PlSmallStr, dtype: DataType) -> Option { self.inner.insert(name, dtype) } @@ -376,7 +376,7 @@ impl Schema { let fields: Vec<_> = self .inner .iter() - .map(|(name, dtype)| dtype.to_arrow_field(name.as_str(), compat_level)) + .map(|(name, dtype)| dtype.to_arrow_field(name.clone(), compat_level)) .collect(); ArrowSchema::from(fields) } @@ -388,7 +388,7 @@ impl Schema { pub fn iter_fields(&self) -> impl ExactSizeIterator + '_ { self.inner .iter() - .map(|(name, dtype)| Field::new(name, dtype.clone())) + .map(|(name, dtype)| Field::new(name.clone(), dtype.clone())) } /// Iterates over references to the dtypes in this schema. @@ -402,14 +402,14 @@ impl Schema { } /// Iterates over references to the names in this schema. - pub fn iter_names(&self) -> impl '_ + ExactSizeIterator { + pub fn iter_names(&self) -> impl '_ + ExactSizeIterator { self.inner.iter().map(|(name, _dtype)| name) } /// Iterates over the `(&name, &dtype)` pairs in this schema. 
/// /// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s - pub fn iter(&self) -> impl Iterator + '_ { + pub fn iter(&self) -> impl Iterator + '_ { self.inner.iter() } @@ -439,11 +439,11 @@ impl Schema { .into_iter() .map(|c| { let name = c.as_ref(); - let dtype = self + let (_, name, dtype) = self .inner - .get(name) + .get_full(name) .ok_or_else(|| polars_err!(col_not_found = name))?; - PolarsResult::Ok((SmartString::from(name), dtype.clone())) + PolarsResult::Ok((name.clone(), dtype.clone())) }) .try_collect()?, }) @@ -453,8 +453,8 @@ impl Schema { pub type SchemaRef = Arc; impl IntoIterator for Schema { - type Item = (SmartString, DataType); - type IntoIter = as IntoIterator>::IntoIter; + type Item = (PlSmallStr, DataType); + type IntoIter = as IntoIterator>::IntoIter; fn into_iter(self) -> Self::IntoIter { self.inner.into_iter() @@ -467,7 +467,11 @@ pub trait IndexOfSchema: Debug { fn index_of(&self, name: &str) -> Option; /// Get a vector of all column names. 
- fn get_names(&self) -> Vec<&str>; + fn get_names(&self) -> Vec<&PlSmallStr>; + + fn get_names_str(&self) -> Vec<&str>; + + fn get_names_owned(&self) -> Vec; fn try_index_of(&self, name: &str) -> PolarsResult { self.index_of(name).ok_or_else(|| { @@ -484,17 +488,33 @@ impl IndexOfSchema for Schema { self.inner.get_index_of(name) } - fn get_names(&self) -> Vec<&str> { - self.iter_names().map(|name| name.as_str()).collect() + fn get_names(&self) -> Vec<&PlSmallStr> { + self.iter_names().collect() + } + + fn get_names_owned(&self) -> Vec { + self.iter_names().cloned().collect() + } + + fn get_names_str(&self) -> Vec<&str> { + self.iter_names().map(|x| x.as_str()).collect() } } impl IndexOfSchema for ArrowSchema { fn index_of(&self, name: &str) -> Option { - self.fields.iter().position(|f| f.name == name) + self.fields.iter().position(|f| f.name.as_str() == name) + } + + fn get_names(&self) -> Vec<&PlSmallStr> { + self.fields.iter().map(|f| &f.name).collect() + } + + fn get_names_owned(&self) -> Vec { + self.fields.iter().map(|f| f.name.clone()).collect() } - fn get_names(&self) -> Vec<&str> { + fn get_names_str(&self) -> Vec<&str> { self.fields.iter().map(|f| f.name.as_str()).collect() } } diff --git a/crates/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs index 15b8358d62e3..145f05c9af38 100644 --- a/crates/polars-core/src/serde/chunked_array.rs +++ b/crates/polars-core/src/serde/chunked_array.rs @@ -46,7 +46,7 @@ where fn serialize_impl( serializer: S, - name: &str, + name: &PlSmallStr, dtype: &DataType, bit_settings: MetadataFlags, ca: &ChunkedArray, diff --git a/crates/polars-core/src/serde/mod.rs b/crates/polars-core/src/serde/mod.rs index b0157956d8cf..86fbf5c52007 100644 --- a/crates/polars-core/src/serde/mod.rs +++ b/crates/polars-core/src/serde/mod.rs @@ -10,14 +10,14 @@ mod test { #[test] fn test_serde() -> PolarsResult<()> { - let ca = UInt32Chunked::new("foo", &[Some(1), None, Some(2)]); + let ca = 
UInt32Chunked::new("foo".into(), &[Some(1), None, Some(2)]); let json = serde_json::to_string(&ca).unwrap(); let out = serde_json::from_str::(&json).unwrap(); assert!(ca.into_series().equals_missing(&out)); - let ca = StringChunked::new("foo", &[Some("foo"), None, Some("bar")]); + let ca = StringChunked::new("foo".into(), &[Some("foo"), None, Some("bar")]); let json = serde_json::to_string(&ca).unwrap(); @@ -30,7 +30,7 @@ mod test { /// test using the `DeserializedOwned` trait #[test] fn test_serde_owned() { - let ca = UInt32Chunked::new("foo", &[Some(1), None, Some(2)]); + let ca = UInt32Chunked::new("foo".into(), &[Some(1), None, Some(2)]); let json = serde_json::to_string(&ca).unwrap(); @@ -39,10 +39,10 @@ mod test { } fn sample_dataframe() -> DataFrame { - let s1 = Series::new("foo", &[1, 2, 3]); - let s2 = Series::new("bar", &[Some(true), None, Some(false)]); - let s3 = Series::new("string", &["mouse", "elephant", "dog"]); - let s_list = Series::new("list", &[s1.clone(), s1.clone(), s1.clone()]); + let s1 = Series::new("foo".into(), &[1, 2, 3]); + let s2 = Series::new("bar".into(), &[Some(true), None, Some(false)]); + let s3 = Series::new("string".into(), &["mouse", "elephant", "dog"]); + let s_list = Series::new("list".into(), &[s1.clone(), s1.clone(), s1.clone()]); DataFrame::new(vec![s1, s2, s3, s_list]).unwrap() } @@ -90,7 +90,7 @@ mod test { #[test] fn test_serde_binary_series_owned_bincode() { let s1 = Series::new( - "foo", + "foo".into(), &[ vec![1u8, 2u8, 3u8], vec![4u8, 5u8, 6u8, 7u8], @@ -115,15 +115,15 @@ mod test { AnyValue::String("1:3"), ], vec![ - Field::new("fld_1", DataType::String), - Field::new("fld_2", DataType::String), - Field::new("fld_3", DataType::String), + Field::new("fld_1".into(), DataType::String), + Field::new("fld_2".into(), DataType::String), + Field::new("fld_3".into(), DataType::String), ], ))); let dtype = DataType::Struct(vec![ - Field::new("fld_1", DataType::String), - Field::new("fld_2", DataType::String), - 
Field::new("fld_3", DataType::String), + Field::new("fld_1".into(), DataType::String), + Field::new("fld_2".into(), DataType::String), + Field::new("fld_3".into(), DataType::String), ]); let row_2 = AnyValue::StructOwned(Box::new(( vec![ @@ -132,15 +132,16 @@ mod test { AnyValue::String("2:3"), ], vec![ - Field::new("fld_1", DataType::String), - Field::new("fld_2", DataType::String), - Field::new("fld_3", DataType::String), + Field::new("fld_1".into(), DataType::String), + Field::new("fld_2".into(), DataType::String), + Field::new("fld_3".into(), DataType::String), ], ))); let row_3 = AnyValue::Null; - let s = Series::from_any_values_and_dtype("item", &[row_1, row_2, row_3], &dtype, false) - .unwrap(); + let s = + Series::from_any_values_and_dtype("item".into(), &[row_1, row_2, row_3], &dtype, false) + .unwrap(); let df = DataFrame::new(vec![s]).unwrap(); let df_str = serde_json::to_string(&df).unwrap(); diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 49e9b6d004be..3506a0e9cc89 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -146,95 +146,96 @@ impl<'de> Deserialize<'de> for Series { return Err(de::Error::missing_field("values")); } let name = name.ok_or_else(|| de::Error::missing_field("name"))?; + let name = PlSmallStr::from_str(name.as_ref()); let dtype = dtype.ok_or_else(|| de::Error::missing_field("datatype"))?; let mut s = match dtype { #[cfg(feature = "dtype-i8")] DataType::Int8 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-u8")] DataType::UInt8 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-i16")] DataType::Int16 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-u16")] DataType::UInt16 => { let values: 
Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::Int32 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::UInt32 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::Int64 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::UInt64 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-date")] DataType::Date => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values).cast(&DataType::Date).unwrap()) + Ok(Series::new(name, values).cast(&DataType::Date).unwrap()) }, #[cfg(feature = "dtype-datetime")] DataType::Datetime(tu, tz) => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values) + Ok(Series::new(name, values) .cast(&DataType::Datetime(tu, tz)) .unwrap()) }, #[cfg(feature = "dtype-duration")] DataType::Duration(tu) => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values) + Ok(Series::new(name, values) .cast(&DataType::Duration(tu)) .unwrap()) }, #[cfg(feature = "dtype-time")] DataType::Time => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values).cast(&DataType::Time).unwrap()) + Ok(Series::new(name, values).cast(&DataType::Time).unwrap()) }, #[cfg(feature = "dtype-decimal")] DataType::Decimal(precision, Some(scale)) => { let values: Vec> = map.next_value()?; - Ok(ChunkedArray::from_slice_options(&name, &values) + Ok(ChunkedArray::from_slice_options(name, &values) .into_decimal_unchecked(precision, scale) .into_series()) }, DataType::Boolean => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::Float32 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + 
Ok(Series::new(name, values)) }, DataType::Float64 => { let values: Vec> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::String => { let values: Vec>> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, DataType::List(inner) => { let values: Vec> = map.next_value()?; - let mut lb = AnonymousListBuilder::new(&name, values.len(), Some(*inner)); + let mut lb = AnonymousListBuilder::new(name, values.len(), Some(*inner)); for value in &values { lb.append_opt_series(value.as_ref()).map_err(|e| { de::Error::custom(format!("could not append series to list: {e}")) @@ -246,7 +247,7 @@ impl<'de> Deserialize<'de> for Series { DataType::Array(inner, width) => { let values: Vec> = map.next_value()?; let mut builder = - get_fixed_size_list_builder(&inner, values.len(), width, &name) + get_fixed_size_list_builder(&inner, values.len(), width, name) .map_err(|e| { de::Error::custom(format!( "could not get supported list builder: {e}" @@ -271,25 +272,25 @@ impl<'de> Deserialize<'de> for Series { }, DataType::Binary => { let values: Vec>> = map.next_value()?; - Ok(Series::new(&name, values)) + Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { let values: Vec = map.next_value()?; - let ca = StructChunked::from_series(&name, &values).unwrap(); + let ca = StructChunked::from_series(name.clone(), &values).unwrap(); let mut s = ca.into_series(); - s.rename(&name); + s.rename(name); Ok(s) }, #[cfg(feature = "dtype-categorical")] dt @ (DataType::Categorical(_, _) | DataType::Enum(_, _)) => { let values: Vec>> = map.next_value()?; - Ok(Series::new(&name, values).cast(&dt).unwrap()) + Ok(Series::new(name, values).cast(&dt).unwrap()) }, DataType::Null => { let values: Vec = map.next_value()?; let len = values.first().unwrap(); - Ok(Series::new_null(&name, *len)) + Ok(Series::new_null(name, *len)) }, dt => Err(A::Error::custom(format!( "deserializing data of type {dt} 
is not supported" diff --git a/crates/polars-core/src/series/amortized_iter.rs b/crates/polars-core/src/series/amortized_iter.rs index 7cdf8507c29f..7d32bfcb4bf5 100644 --- a/crates/polars-core/src/series/amortized_iter.rs +++ b/crates/polars-core/src/series/amortized_iter.rs @@ -51,8 +51,8 @@ impl AmortSeries { let s = &(*self.container); debug_assert_eq!(s.chunks().len(), 1); let array_ref = s.chunks().get_unchecked(0).clone(); - let name = s.name(); - Series::from_chunks_and_dtype_unchecked(name, vec![array_ref], s.dtype()) + let name = s.name().clone(); + Series::from_chunks_and_dtype_unchecked(name.clone(), vec![array_ref], s.dtype()) } } @@ -93,7 +93,7 @@ impl AmortSeries { // SAFETY: // type must be matching pub(crate) unsafe fn unstable_series_container_and_ptr( - name: &str, + name: PlSmallStr, inner_values: ArrayRef, iter_dtype: &DataType, ) -> (Series, *mut ArrayRef) { diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 83abf75e980d..d60be8ae08dd 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -18,7 +18,7 @@ impl<'a, T: AsRef<[AnyValue<'a>]>> NamedFrom]> for Series { /// of [`DataType::Null`], which is always allowed). /// /// [`AnyValue`]: crate::datatypes::AnyValue - fn new(name: &str, values: T) -> Self { + fn new(name: PlSmallStr, values: T) -> Self { let values = values.as_ref(); Series::from_any_values(name, values, true).expect("data types of values should match") } @@ -36,7 +36,11 @@ impl Series { /// An error is returned if no supertype can be determined. /// **WARNING**: A full pass over the values is required to determine the supertype. /// - If no values were passed, the resulting data type is `Null`. 
- pub fn from_any_values(name: &str, values: &[AnyValue], strict: bool) -> PolarsResult { + pub fn from_any_values( + name: PlSmallStr, + values: &[AnyValue], + strict: bool, + ) -> PolarsResult { fn get_first_non_null_dtype(values: &[AnyValue]) -> DataType { let mut all_flat_null = true; let first_non_null = values.iter().find(|av| { @@ -82,7 +86,7 @@ impl Series { /// data type. If `strict` is `false`, values that do not match the given data type /// are cast. If casting is not possible, the values are set to null instead. pub fn from_any_values_and_dtype( - name: &str, + name: PlSmallStr, values: &[AnyValue], dtype: &DataType, strict: bool, @@ -158,7 +162,7 @@ impl Series { DataType::Struct(fields) => any_values_to_struct(values, fields, strict)?, #[cfg(feature = "object")] DataType::Object(_, registry) => any_values_to_object(values, registry)?, - DataType::Null => Series::new_null(name, values.len()), + DataType::Null => Series::new_null(PlSmallStr::const_default(), values.len()), dt => { polars_bail!( InvalidOperation: @@ -185,9 +189,10 @@ fn any_values_to_integer( fn any_values_to_integer_strict( values: &[AnyValue], ) -> PolarsResult> { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { - match av { + match &av { av if av.is_integer() => { let opt_val = av.extract::(); let val = match opt_val { @@ -212,7 +217,8 @@ fn any_values_to_integer( fn any_values_to_f32(values: &[AnyValue], strict: bool) -> PolarsResult { fn any_values_to_f32_strict(values: &[AnyValue]) -> PolarsResult { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Float32(i) => builder.append_value(*i), @@ -230,7 +236,8 @@ fn any_values_to_f32(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { 
fn any_values_to_f64_strict(values: &[AnyValue]) -> PolarsResult { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Float64(i) => builder.append_value(*i), @@ -249,7 +256,7 @@ fn any_values_to_f64(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { - let mut builder = BooleanChunkedBuilder::new("", values.len()); + let mut builder = BooleanChunkedBuilder::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Boolean(b) => builder.append_value(*b), @@ -270,7 +277,7 @@ fn any_values_to_bool(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { fn any_values_to_string_strict(values: &[AnyValue]) -> PolarsResult { - let mut builder = StringChunkedBuilder::new("", values.len()); + let mut builder = StringChunkedBuilder::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::String(s) => builder.append_value(s), @@ -282,7 +289,7 @@ fn any_values_to_string(values: &[AnyValue], strict: bool) -> PolarsResult StringChunked { - let mut builder = StringChunkedBuilder::new("", values.len()); + let mut builder = StringChunkedBuilder::new(PlSmallStr::const_default(), values.len()); let mut owned = String::new(); // Amortize allocations. 
for av in values { match av { @@ -308,7 +315,7 @@ fn any_values_to_string(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { fn any_values_to_binary_strict(values: &[AnyValue]) -> PolarsResult { - let mut builder = BinaryChunkedBuilder::new("", values.len()); + let mut builder = BinaryChunkedBuilder::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Binary(s) => builder.append_value(*s), @@ -326,7 +333,7 @@ fn any_values_to_binary(values: &[AnyValue], strict: bool) -> PolarsResult Some(*b), AnyValue::BinaryOwned(b) => Some(&**b), AnyValue::String(s) => Some(s.as_bytes()), - AnyValue::StringOwned(s) => Some(s.as_bytes()), + AnyValue::StringOwned(s) => Some(s.as_str().as_bytes()), _ => None, }) .collect_trusted() @@ -340,7 +347,8 @@ fn any_values_to_binary(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Date(i) => builder.append_value(*i), @@ -361,7 +369,8 @@ fn any_values_to_date(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Time(i) => builder.append_value(*i), @@ -387,7 +396,8 @@ fn any_values_to_datetime( time_zone: Option, strict: bool, ) -> PolarsResult { - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); let target_dtype = DataType::Datetime(time_unit, time_zone.clone()); for av in values { match av { @@ -413,7 +423,8 @@ fn any_values_to_duration( time_unit: TimeUnit, strict: bool, ) -> PolarsResult { - let mut builder = 
PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); let target_dtype = DataType::Duration(time_unit); for av in values { match av { @@ -485,7 +496,8 @@ fn any_values_to_decimal( }; let target_dtype = DataType::Decimal(precision, Some(scale)); - let mut builder = PrimitiveChunkedBuilder::::new("", values.len()); + let mut builder = + PrimitiveChunkedBuilder::::new(PlSmallStr::const_default(), values.len()); for av in values { match av { // Allow equal or less scale. We do want to support different scales even in 'strict' mode. @@ -526,9 +538,10 @@ fn any_values_to_list( // Structs don't support empty fields yet. // We must ensure the data-types match what we do physical #[cfg(feature = "dtype-struct")] - DataType::Struct(fields) if fields.is_empty() => { - DataType::Struct(vec![Field::new("", DataType::Null)]) - }, + DataType::Struct(fields) if fields.is_empty() => DataType::Struct(vec![Field::new( + PlSmallStr::const_default(), + DataType::Null, + )]), _ => inner_type.clone(), }; let target_dtype = DataType::List(Box::new(it)); @@ -558,7 +571,9 @@ fn any_values_to_list( } else { match b.cast(inner_type) { Ok(out) => Some(out), - Err(_) => Some(Series::full_null(b.name(), b.len(), inner_type)), + Err(_) => { + Some(Series::full_null(b.name().clone(), b.len(), inner_type)) + }, } } }, @@ -617,7 +632,7 @@ fn any_values_to_array( None }, }) - .collect_ca_with_dtype("", target_dtype.clone()) + .collect_ca_with_dtype(PlSmallStr::const_default(), target_dtype.clone()) } // Make sure that wrongly inferred AnyValues don't deviate from the datatype. 
else { @@ -629,7 +644,7 @@ fn any_values_to_array( } else { let s = match b.cast(inner_type) { Ok(out) => out, - Err(_) => Series::full_null(b.name(), b.len(), inner_type), + Err(_) => Series::full_null(b.name().clone(), b.len(), inner_type), }; to_arr(&s) } @@ -640,7 +655,7 @@ fn any_values_to_array( None }, }) - .collect_ca_with_dtype("", target_dtype.clone()) + .collect_ca_with_dtype(PlSmallStr::const_default(), target_dtype.clone()) }; if strict && !valid { @@ -670,7 +685,9 @@ fn any_values_to_struct( ) -> PolarsResult { // Fast path for structs with no fields. if fields.is_empty() { - return Ok(StructChunked::full_null("", values.len()).into_series()); + return Ok( + StructChunked::full_null(PlSmallStr::const_default(), values.len()).into_series(), + ); } // The physical series fields of the struct. @@ -723,14 +740,19 @@ fn any_values_to_struct( } // If the inferred dtype is null, we let auto inference work. let s = if matches!(field.dtype, DataType::Null) { - Series::from_any_values(field.name(), &field_avs, strict)? + Series::from_any_values(field.name().clone(), &field_avs, strict)? } else { - Series::from_any_values_and_dtype(field.name(), &field_avs, &field.dtype, strict)? + Series::from_any_values_and_dtype( + field.name().clone(), + &field_avs, + &field.dtype, + strict, + )? 
}; series_fields.push(s) } - let mut out = StructChunked::from_series("", &series_fields)?; + let mut out = StructChunked::from_series(PlSmallStr::const_default(), &series_fields)?; if has_outer_validity { let mut validity = MutableBitmap::new(); validity.extend_constant(values.len(), true); @@ -753,7 +775,8 @@ fn any_values_to_object( None => { use crate::chunked_array::object::registry; let converter = registry::get_object_converter(); - let mut builder = registry::get_object_builder("", values.len()); + let mut builder = + registry::get_object_builder(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Object(val) => builder.append_value(val.as_any()), @@ -769,7 +792,8 @@ fn any_values_to_object( builder }, Some(registry) => { - let mut builder = (*registry.builder_constructor)("", values.len()); + let mut builder = + (*registry.builder_constructor)(PlSmallStr::const_default(), values.len()); for av in values { match av { AnyValue::Object(val) => builder.append_value(val.as_any()), diff --git a/crates/polars-core/src/series/arithmetic/borrowed.rs b/crates/polars-core/src/series/arithmetic/borrowed.rs index 6cecab742ffd..6003d0b05792 100644 --- a/crates/polars-core/src/series/arithmetic/borrowed.rs +++ b/crates/polars-core/src/series/arithmetic/borrowed.rs @@ -548,7 +548,7 @@ impl Mul for &Series { (_, Duration(_)) => { // swap order let out = rhs.multiply(self)?; - Ok(out.with_name(self.name())) + Ok(out.with_name(self.name().clone())) }, _ => { let (lhs, rhs) = coerce_lhs_rhs(self, rhs)?; @@ -892,7 +892,7 @@ mod test { #[allow(clippy::eq_op)] fn test_arithmetic_series() -> PolarsResult<()> { // Series +-/* Series - let s = Series::new("foo", [1, 2, 3]); + let s = Series::new("foo".into(), [1, 2, 3]); assert_eq!( Vec::from((&s * &s)?.i32().unwrap()), [Some(1), Some(4), Some(9)] @@ -949,9 +949,9 @@ mod test { [Some(0), Some(1), Some(1)] ); - assert_eq!((&s * &s)?.name(), "foo"); - assert_eq!((&s * 1).name(), "foo"); - 
assert_eq!((1.div(&s)).name(), "foo"); + assert_eq!((&s * &s)?.name().as_str(), "foo"); + assert_eq!((&s * 1).name().as_str(), "foo"); + assert_eq!((1.div(&s)).name().as_str(), "foo"); Ok(()) } @@ -959,13 +959,13 @@ mod test { #[test] #[cfg(feature = "checked_arithmetic")] fn test_checked_div() { - let s = Series::new("foo", [1i32, 0, 1]); + let s = Series::new("foo".into(), [1i32, 0, 1]); let out = s.checked_div(&s).unwrap(); assert_eq!(Vec::from(out.i32().unwrap()), &[Some(1), None, Some(1)]); let out = s.checked_div_num(0).unwrap(); assert_eq!(Vec::from(out.i32().unwrap()), &[None, None, None]); - let s_f32 = Series::new("float32", [1.0f32, 0.0, 1.0]); + let s_f32 = Series::new("float32".into(), [1.0f32, 0.0, 1.0]); let out = s_f32.checked_div(&s_f32).unwrap(); assert_eq!( Vec::from(out.f32().unwrap()), @@ -974,7 +974,7 @@ mod test { let out = s_f32.checked_div_num(0.0f32).unwrap(); assert_eq!(Vec::from(out.f32().unwrap()), &[None, None, None]); - let s_f64 = Series::new("float64", [1.0f64, 0.0, 1.0]); + let s_f64 = Series::new("float64".into(), [1.0f64, 0.0, 1.0]); let out = s_f64.checked_div(&s_f64).unwrap(); assert_eq!( Vec::from(out.f64().unwrap()), diff --git a/crates/polars-core/src/series/comparison.rs b/crates/polars-core/src/series/comparison.rs index cdb5aea3bcc9..6ccb4db7c219 100644 --- a/crates/polars-core/src/series/comparison.rs +++ b/crates/polars-core/src/series/comparison.rs @@ -17,21 +17,21 @@ macro_rules! impl_compare { .categorical() .unwrap() .$method(rhs.categorical().unwrap())? - .with_name(lhs.name())); + .with_name(lhs.name().clone())); }, (Categorical(_, _) | Enum(_, _), String) => { return Ok(lhs .categorical() .unwrap() .$method(rhs.str().unwrap())? - .with_name(lhs.name())); + .with_name(lhs.name().clone())); }, (String, Categorical(_, _) | Enum(_, _)) => { return Ok(rhs .categorical() .unwrap() .$method(lhs.str().unwrap())? - .with_name(lhs.name())); + .with_name(lhs.name().clone())); }, _ => (), }; @@ -80,7 +80,7 @@ macro_rules! 
impl_compare { dt => polars_bail!(InvalidOperation: "could not apply comparison on series of dtype '{}; operand names: '{}', '{}'", dt, lhs.name(), rhs.name()), }; - out.rename(lhs.name()); + out.rename(lhs.name().clone()); PolarsResult::Ok(out) }}; } @@ -240,7 +240,7 @@ impl ChunkCompare<&str> for Series { DataType::Categorical(_, _) | DataType::Enum(_, _) => { self.categorical().unwrap().equal(rhs) }, - _ => Ok(BooleanChunked::full(self.name(), false, self.len())), + _ => Ok(BooleanChunked::full(self.name().clone(), false, self.len())), } } @@ -252,7 +252,11 @@ impl ChunkCompare<&str> for Series { DataType::Categorical(_, _) | DataType::Enum(_, _) => { self.categorical().unwrap().equal_missing(rhs) }, - _ => Ok(replace_non_null(self.name(), self.0.chunks(), false)), + _ => Ok(replace_non_null( + self.name().clone(), + self.0.chunks(), + false, + )), } } @@ -264,7 +268,7 @@ impl ChunkCompare<&str> for Series { DataType::Categorical(_, _) | DataType::Enum(_, _) => { self.categorical().unwrap().not_equal(rhs) }, - _ => Ok(BooleanChunked::full(self.name(), true, self.len())), + _ => Ok(BooleanChunked::full(self.name().clone(), true, self.len())), } } @@ -276,7 +280,7 @@ impl ChunkCompare<&str> for Series { DataType::Categorical(_, _) | DataType::Enum(_, _) => { self.categorical().unwrap().not_equal_missing(rhs) }, - _ => Ok(replace_non_null(self.name(), self.0.chunks(), true)), + _ => Ok(replace_non_null(self.name().clone(), self.0.chunks(), true)), } } diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index 5062b7230476..335475b549d3 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -31,7 +31,7 @@ impl Series { /// /// The caller must ensure that the given `dtype`'s physical type matches all the `ArrayRef` dtypes. 
pub unsafe fn from_chunks_and_dtype_unchecked( - name: &str, + name: PlSmallStr, chunks: Vec, dtype: &DataType, ) -> Self { @@ -121,7 +121,7 @@ impl Series { // (the pid is checked before dereference) { let pe = PolarsExtension::new(arr.clone()); - let s = pe.get_series(name); + let s = pe.get_series(&name); pe.take_and_forget(); s } @@ -138,7 +138,7 @@ impl Series { /// # Safety /// The caller must ensure that the given `dtype` matches all the `ArrayRef` dtypes. pub unsafe fn _try_from_arrow_unchecked( - name: &str, + name: PlSmallStr, chunks: Vec, dtype: &ArrowDataType, ) -> PolarsResult { @@ -150,7 +150,7 @@ impl Series { /// # Safety /// The caller must ensure that the given `dtype` matches all the `ArrayRef` dtypes. pub unsafe fn _try_from_arrow_unchecked_with_md( - name: &str, + name: PlSmallStr, chunks: Vec, dtype: &ArrowDataType, md: Option<&Metadata>, @@ -393,7 +393,7 @@ impl Series { // (the pid is checked before dereference) let s = { let pe = PolarsExtension::new(arr.clone()); - let s = pe.get_series(name); + let s = pe.get_series(&name); pe.take_and_forget(); s }; @@ -459,7 +459,7 @@ impl Series { } } -fn map_arrays_to_series(name: &str, chunks: Vec) -> PolarsResult { +fn map_arrays_to_series(name: PlSmallStr, chunks: Vec) -> PolarsResult { let chunks = chunks .iter() .map(|arr| { @@ -504,7 +504,12 @@ unsafe fn to_physical_and_dtype( feature_gated!("dtype-categorical", { let s = unsafe { let dt = dt.clone(); - Series::_try_from_arrow_unchecked_with_md("", arrays, &dt, md) + Series::_try_from_arrow_unchecked_with_md( + PlSmallStr::const_default(), + arrays, + &dt, + md, + ) } .unwrap(); (s.chunks().clone(), s.dtype().clone()) @@ -596,7 +601,9 @@ unsafe fn to_physical_and_dtype( let arrow_fields = values .iter() .zip(_fields.iter()) - .map(|(arr, field)| ArrowField::new(&field.name, arr.data_type().clone(), true)) + .map(|(arr, field)| { + ArrowField::new(field.name.clone(), arr.data_type().clone(), true) + }) .collect(); let arrow_array = 
Box::new(StructArray::new( ArrowDataType::Struct(arrow_fields), @@ -606,7 +613,7 @@ unsafe fn to_physical_and_dtype( let polars_fields = _fields .iter() .zip(dtypes) - .map(|(field, dtype)| Field::new(&field.name, dtype)) + .map(|(field, dtype)| Field::new(field.name.clone(), dtype)) .collect(); (vec![arrow_array], DataType::Struct(polars_fields)) }) @@ -620,7 +627,8 @@ unsafe fn to_physical_and_dtype( | ArrowDataType::Decimal(_, _) | ArrowDataType::Date64) => { let dt = dt.clone(); - let mut s = Series::_try_from_arrow_unchecked("", arrays, &dt).unwrap(); + let mut s = Series::_try_from_arrow_unchecked(PlSmallStr::const_default(), arrays, &dt) + .unwrap(); let dtype = s.dtype().clone(); (std::mem::take(s.chunks_mut()), dtype) }, @@ -649,10 +657,24 @@ fn check_types(chunks: &[ArrayRef]) -> PolarsResult { Ok(data_type) } -impl TryFrom<(&str, Vec)> for Series { +impl Series { + pub fn try_new( + name: PlSmallStr, + data: T, + ) -> Result>::Error> + where + (PlSmallStr, T): TryInto, + { + // # TODO + // * Remove the TryFrom impls in favor of this + <(PlSmallStr, T) as TryInto>::try_into((name, data)) + } +} + +impl TryFrom<(PlSmallStr, Vec)> for Series { type Error = PolarsError; - fn try_from(name_arr: (&str, Vec)) -> PolarsResult { + fn try_from(name_arr: (PlSmallStr, Vec)) -> PolarsResult { let (name, chunks) = name_arr; let data_type = check_types(&chunks)?; @@ -662,10 +684,10 @@ impl TryFrom<(&str, Vec)> for Series { } } -impl TryFrom<(&str, ArrayRef)> for Series { +impl TryFrom<(PlSmallStr, ArrayRef)> for Series { type Error = PolarsError; - fn try_from(name_arr: (&str, ArrayRef)) -> PolarsResult { + fn try_from(name_arr: (PlSmallStr, ArrayRef)) -> PolarsResult { let (name, arr) = name_arr; Series::try_from((name, vec![arr])) } @@ -683,7 +705,7 @@ impl TryFrom<(&ArrowField, Vec)> for Series { // dtype is checked unsafe { Series::_try_from_arrow_unchecked_with_md( - &field.name, + field.name.clone(), chunks, &data_type, Some(&field.metadata), @@ -772,7 +794,7 @@ 
unsafe impl IntoSeries for Series { } } -fn new_null(name: &str, chunks: &[ArrayRef]) -> Series { +fn new_null(name: PlSmallStr, chunks: &[ArrayRef]) -> Series { let len = chunks.iter().map(|arr| arr.len()).sum(); Series::new_null(name, len) } diff --git a/crates/polars-core/src/series/implementations/array.rs b/crates/polars-core/src/series/implementations/array.rs index bc3ed6d23243..351c7d51b9c9 100644 --- a/crates/polars-core/src/series/implementations/array.rs +++ b/crates/polars-core/src/series/implementations/array.rs @@ -4,7 +4,6 @@ use std::borrow::Cow; use super::{private, MetadataFlags}; use crate::chunked_array::cast::CastOptions; use crate::chunked_array::comparison::*; -use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; #[cfg(feature = "algorithm_group_by")] use crate::frame::group_by::*; @@ -30,10 +29,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } - unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.equal_element(idx_self, idx_other, other) } @@ -73,14 +68,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index fd832de9222a..221ddd25ca8b 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -21,9 +21,6 @@ impl private::PrivateSeries for SeriesWrap { fn _set_flags(&mut self, flags: MetadataFlags) { self.0.set_flags(flags) } - fn 
explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.equal_element(idx_self, idx_other, other) @@ -99,14 +96,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/binary_offset.rs b/crates/polars-core/src/series/implementations/binary_offset.rs index 153bace7a49c..8f1fa74df7b2 100644 --- a/crates/polars-core/src/series/implementations/binary_offset.rs +++ b/crates/polars-core/src/series/implementations/binary_offset.rs @@ -62,14 +62,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 492dac39b8c1..86ee10da02b1 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -20,9 +20,6 @@ impl private::PrivateSeries for SeriesWrap { fn _set_flags(&mut self, flags: MetadataFlags) { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.equal_element(idx_self, idx_other, other) @@ -124,14 +121,14 @@ impl SeriesTrait for SeriesWrap { 
Ok((&self.0).bitor(other).into_series()) } - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 16092a96a5f5..b0f7623e23fd 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -62,14 +62,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - // TODO! explode by offset should return concrete type - self.with_state(true, |cats| { - cats.explode_by_offsets(offsets).u32().unwrap().clone() - }) - .into_series() - } - unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.physical().equal_element(idx_self, idx_other, other) } @@ -133,14 +125,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.physical_mut().rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.physical().chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.physical().name() } diff --git a/crates/polars-core/src/series/implementations/date.rs b/crates/polars-core/src/series/implementations/date.rs index 3882d4976ee0..b7d761683333 100644 --- a/crates/polars-core/src/series/implementations/date.rs +++ b/crates/polars-core/src/series/implementations/date.rs @@ -39,10 +39,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets).into_date().into_series() - } - #[cfg(feature = "zip_with")] 
fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { let other = other.to_physical_repr().into_owned(); @@ -144,14 +140,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index bc35975c5eb3..eed12d8586c5 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -32,13 +32,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0 - .explode_by_offsets(offsets) - .into_datetime(self.0.time_unit(), self.0.time_zone().clone()) - .into_series() - } - #[cfg(feature = "zip_with")] fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { let other = other.to_physical_repr().into_owned(); @@ -145,14 +138,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/decimal.rs b/crates/polars-core/src/series/implementations/decimal.rs index 324ec02dff64..a73c4b839d41 100644 --- a/crates/polars-core/src/series/implementations/decimal.rs +++ b/crates/polars-core/src/series/implementations/decimal.rs @@ -54,7 +54,11 @@ impl SeriesWrap { let arr = ca.downcast_iter().next().unwrap(); // SAFETY: dtype is passed correctly let 
s = unsafe { - Series::from_chunks_and_dtype_unchecked("", vec![arr.values().clone()], dtype) + Series::from_chunks_and_dtype_unchecked( + PlSmallStr::const_default(), + vec![arr.values().clone()], + dtype, + ) }; let new_values = s.array_ref(0).clone(); let data_type = @@ -67,7 +71,7 @@ impl SeriesWrap { ); unsafe { ListChunked::from_chunks_and_dtype_unchecked( - agg_s.name(), + agg_s.name().clone(), vec![Box::new(new_arr)], DataType::List(Box::new(self.dtype().clone())), ) @@ -180,21 +184,10 @@ impl private::PrivateSeries for SeriesWrap { fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { self.0.group_tuples(multithreaded, sorted) } - - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0 - .explode_by_offsets(offsets) - .decimal() - .unwrap() - .as_ref() - .clone() - .into_decimal_unchecked(self.0.precision(), self.0.scale()) - .into_series() - } } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name) } @@ -202,7 +195,7 @@ impl SeriesTrait for SeriesWrap { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 4e1f59c8a6f1..81f5ee7497dd 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -29,13 +29,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.dtype() } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0 - .explode_by_offsets(offsets) - .into_duration(self.0.time_unit()) - .into_series() - } - fn _set_flags(&mut self, flags: MetadataFlags) { self.0.deref_mut().set_flags(flags) } @@ -259,14 +252,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: 
PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index e5419462abd4..1a7f57927e47 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -23,10 +23,6 @@ macro_rules! impl_dyn_series { fn _get_flags(&self) -> MetadataFlags { self.0.get_flags() } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } - unsafe fn equal_element( &self, idx_self: usize, @@ -173,14 +169,14 @@ macro_rules! impl_dyn_series { self.metadata_dyn() } - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index a67dc8e8f487..554e3852f8c8 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -21,10 +21,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } - unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.equal_element(idx_self, idx_other, other) } @@ -50,14 +46,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> 
&PlSmallStr { self.0.name() } @@ -191,13 +187,13 @@ impl SeriesTrait for SeriesWrap { } // this can be called in aggregation, so this fast path can be worth a lot if self.len() == 1 { - return Ok(IdxCa::new_vec(self.name(), vec![0 as IdxSize])); + return Ok(IdxCa::new_vec(self.name().clone(), vec![0 as IdxSize])); } let main_thread = POOL.current_thread_index().is_none(); // arg_unique requires a stable order let groups = self.group_tuples(main_thread, true)?; let first = groups.take_group_firsts(); - Ok(IdxCa::from_vec(self.name(), first)) + Ok(IdxCa::from_vec(self.name().clone(), first)) } fn is_null(&self) -> BooleanChunked { diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 50dd3e1c0042..1a9df0216c14 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -20,7 +20,7 @@ pub(crate) mod null; mod object; mod string; #[cfg(feature = "dtype-struct")] -mod struct__; +mod struct_; #[cfg(feature = "dtype-time")] mod time; @@ -35,7 +35,6 @@ use crate::chunked_array::metadata::MetadataTrait; use crate::chunked_array::ops::compare_inner::{ IntoTotalEqInner, IntoTotalOrdInner, TotalEqInner, TotalOrdInner, }; -use crate::chunked_array::ops::explode::ExplodeByOffsets; use crate::chunked_array::AsSinglePtr; // Utility wrapper struct @@ -90,10 +89,6 @@ macro_rules! impl_dyn_series { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } - unsafe fn equal_element( &self, idx_self: usize, @@ -277,14 +272,14 @@ macro_rules! 
impl_dyn_series { Ok(self.0.bitxor(&other).into_series()) } - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/null.rs b/crates/polars-core/src/series/implementations/null.rs index 0837532df2d5..75e3acb69dda 100644 --- a/crates/polars-core/src/series/implementations/null.rs +++ b/crates/polars-core/src/series/implementations/null.rs @@ -3,20 +3,19 @@ use std::any::Any; use polars_error::constants::LENGTH_LIMIT_MSG; use crate::prelude::compare_inner::{IntoTotalEqInner, TotalEqInner}; -use crate::prelude::explode::ExplodeByOffsets; use crate::prelude::*; use crate::series::private::{PrivateSeries, PrivateSeriesNumeric}; use crate::series::*; impl Series { - pub fn new_null(name: &str, len: usize) -> Series { - NullChunked::new(Arc::from(name), len).into_series() + pub fn new_null(name: PlSmallStr, len: usize) -> Series { + NullChunked::new(name, len).into_series() } } #[derive(Clone)] pub struct NullChunked { - pub(crate) name: Arc, + pub(crate) name: PlSmallStr, length: IdxSize, // we still need chunks as many series consumers expect // chunks to be there @@ -24,7 +23,7 @@ pub struct NullChunked { } impl NullChunked { - pub(crate) fn new(name: Arc, len: usize) -> Self { + pub(crate) fn new(name: PlSmallStr, len: usize) -> Self { Self { name, length: len as IdxSize, @@ -38,7 +37,7 @@ impl NullChunked { impl PrivateSeriesNumeric for NullChunked { fn bit_repr(&self) -> Option { Some(BitRepr::Small(UInt32Chunked::full_null( - self.name.as_ref(), + self.name.clone(), self.len(), ))) } @@ -56,7 +55,7 @@ impl PrivateSeries for NullChunked { self.length = IdxSize::try_from(inner(&self.chunks)).expect(LENGTH_LIMIT_MSG); } fn _field(&self) -> Cow { - Cow::Owned(Field::new(self.name(), DataType::Null)) + 
Cow::Owned(Field::new(self.name().clone(), DataType::Null)) } #[allow(unused)] @@ -78,12 +77,8 @@ impl PrivateSeries for NullChunked { }, }; - Ok(Self::new(self.name().into(), len).into_series()) + Ok(Self::new(self.name().clone(), len).into_series()) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - ExplodeByOffsets::explode_by_offsets(self, offsets) - } - fn subtract(&self, _rhs: &Series) -> PolarsResult { null_arithmetic(self, _rhs, "subtract") } @@ -148,16 +143,16 @@ fn null_arithmetic(lhs: &NullChunked, rhs: &Series, op: &str) -> PolarsResult len_l, _ => polars_bail!(ComputeError: "Cannot {:?} two series of different lengths.", op), }; - Ok(NullChunked::new(lhs.name().into(), output_len).into_series()) + Ok(NullChunked::new(lhs.name().clone(), output_len).into_series()) } impl SeriesTrait for NullChunked { - fn name(&self) -> &str { - self.name.as_ref() + fn name(&self) -> &PlSmallStr { + &self.name } - fn rename(&mut self, name: &str) { - self.name = Arc::from(name) + fn rename(&mut self, name: PlSmallStr) { + self.name = name } fn chunks(&self) -> &Vec { @@ -204,7 +199,7 @@ impl SeriesTrait for NullChunked { } fn cast(&self, data_type: &DataType, _cast_options: CastOptions) -> PolarsResult { - Ok(Series::full_null(self.name.as_ref(), self.len(), data_type)) + Ok(Series::full_null(self.name.clone(), self.len(), data_type)) } fn null_count(&self) -> usize { @@ -269,11 +264,11 @@ impl SeriesTrait for NullChunked { } fn is_null(&self) -> BooleanChunked { - BooleanChunked::full(self.name(), true, self.len()) + BooleanChunked::full(self.name().clone(), true, self.len()) } fn is_not_null(&self) -> BooleanChunked { - BooleanChunked::full(self.name(), false, self.len()) + BooleanChunked::full(self.name().clone(), false, self.len()) } fn reverse(&self) -> Series { diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index 1cb747c4cb8e..b4821682693f 100644 --- 
a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -21,7 +21,7 @@ where { fn get_list_builder( &self, - _name: &str, + _name: PlSmallStr, _values_capacity: usize, _list_capacity: usize, ) -> Box { @@ -83,7 +83,7 @@ impl SeriesTrait for SeriesWrap> where T: PolarsObject, { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { ObjectChunked::rename(&mut self.0, name) } @@ -91,7 +91,7 @@ where ObjectChunked::chunk_lengths(&self.0) } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { ObjectChunked::name(&self.0) } @@ -246,7 +246,7 @@ mod test { } } - let ca = ObjectChunked::new_from_vec("a", vec![0i32, 1, 2]); + let ca = ObjectChunked::new_from_vec("a".into(), vec![0i32, 1, 2]); let s = ca.into_series(); let ca = s.as_any().downcast_ref::>().unwrap(); diff --git a/crates/polars-core/src/series/implementations/string.rs b/crates/polars-core/src/series/implementations/string.rs index 3a795e23092d..3cceaca32c48 100644 --- a/crates/polars-core/src/series/implementations/string.rs +++ b/crates/polars-core/src/series/implementations/string.rs @@ -21,10 +21,6 @@ impl private::PrivateSeries for SeriesWrap { fn _get_flags(&self) -> MetadataFlags { self.0.get_flags() } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets) - } - unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { self.0.equal_element(idx_self, idx_other, other) } @@ -99,14 +95,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/implementations/struct__.rs 
b/crates/polars-core/src/series/implementations/struct_.rs similarity index 93% rename from crates/polars-core/src/series/implementations/struct__.rs rename to crates/polars-core/src/series/implementations/struct_.rs index 07b35502dd6b..805f06d86bac 100644 --- a/crates/polars-core/src/series/implementations/struct__.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -32,12 +32,6 @@ impl PrivateSeries for SeriesWrap { fn _set_flags(&mut self, _flags: MetadataFlags) {} - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self._apply_fields(|s| s.explode_by_offsets(offsets)) - .unwrap() - .into_series() - } - // TODO! remove this. Very slow. Asof join should use row-encoding. unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { let other = other.struct_().unwrap(); @@ -80,7 +74,7 @@ impl PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name) } @@ -88,7 +82,7 @@ impl SeriesTrait for SeriesWrap { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } @@ -203,12 +197,12 @@ impl SeriesTrait for SeriesWrap { fn arg_unique(&self) -> PolarsResult { // this can called in aggregation, so this fast path can be worth a lot if self.len() == 1 { - return Ok(IdxCa::new_vec(self.name(), vec![0 as IdxSize])); + return Ok(IdxCa::new_vec(self.name().clone(), vec![0 as IdxSize])); } let main_thread = POOL.current_thread_index().is_none(); let groups = self.group_tuples(main_thread, true)?; let first = groups.take_group_firsts(); - Ok(IdxCa::from_vec(self.name(), first)) + Ok(IdxCa::from_vec(self.name().clone(), first)) } fn has_nulls(&self) -> bool { @@ -223,7 +217,7 @@ impl SeriesTrait for SeriesWrap { }; BooleanArray::from_data_default(bitmap, None) }); - BooleanChunked::from_chunk_iter(self.name(), iter) + BooleanChunked::from_chunk_iter(self.name().clone(), 
iter) } fn is_not_null(&self) -> BooleanChunked { @@ -234,7 +228,7 @@ impl SeriesTrait for SeriesWrap { }; BooleanArray::from_data_default(bitmap, None) }); - BooleanChunked::from_chunk_iter(self.name(), iter) + BooleanChunked::from_chunk_iter(self.name().clone(), iter) } fn reverse(&self) -> Series { diff --git a/crates/polars-core/src/series/implementations/time.rs b/crates/polars-core/src/series/implementations/time.rs index c197de232ef1..e0f87a4d80f8 100644 --- a/crates/polars-core/src/series/implementations/time.rs +++ b/crates/polars-core/src/series/implementations/time.rs @@ -39,10 +39,6 @@ impl private::PrivateSeries for SeriesWrap { self.0.set_flags(flags) } - fn explode_by_offsets(&self, offsets: &[i64]) -> Series { - self.0.explode_by_offsets(offsets).into_time().into_series() - } - #[cfg(feature = "zip_with")] fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { let other = other.to_physical_repr().into_owned(); @@ -119,14 +115,14 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn rename(&mut self, name: &str) { + fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } - fn name(&self) -> &str { + fn name(&self) -> &PlSmallStr { self.0.name() } diff --git a/crates/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs index d1f722a9bd7e..aa6084ae0896 100644 --- a/crates/polars-core/src/series/into.rs +++ b/crates/polars-core/src/series/into.rs @@ -34,7 +34,7 @@ impl Series { let dtype = &field.dtype; let s = unsafe { Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![values.clone()], &dtype.to_physical(), ) @@ -59,7 +59,7 @@ impl Series { // We pass physical arrays and cast to logical before we convert to arrow. 
let s = unsafe { Series::from_chunks_and_dtype_unchecked( - "", + PlSmallStr::const_default(), vec![arr.values().clone()], &inner.to_physical(), ) @@ -84,7 +84,8 @@ impl Series { let ca = self.categorical().unwrap(); let arr = ca.physical().chunks()[chunk_idx].clone(); // SAFETY: categoricals are always u32's. - let cats = unsafe { UInt32Chunked::from_chunks("", vec![arr]) }; + let cats = + unsafe { UInt32Chunked::from_chunks(PlSmallStr::const_default(), vec![arr]) }; // SAFETY: we only take a single chunk and change nothing about the index/rev_map mapping. let new = unsafe { diff --git a/crates/polars-core/src/series/iterator.rs b/crates/polars-core/src/series/iterator.rs index 11e3a04abf43..d4dc5df63ccb 100644 --- a/crates/polars-core/src/series/iterator.rs +++ b/crates/polars-core/src/series/iterator.rs @@ -200,7 +200,7 @@ mod test { #[test] fn test_iter() { - let a = Series::new("age", [23, 71, 9].as_ref()); + let a = Series::new("age".into(), [23, 71, 9].as_ref()); let _b = a .i32() .unwrap() @@ -212,7 +212,7 @@ mod test { fn test_iter_str() { let data = [Some("John"), Some("Doe"), None]; let a: Series = data.into_iter().collect(); - let b = Series::new("", data); + let b = Series::new("".into(), data); assert_eq!(a, b); } @@ -220,7 +220,7 @@ mod test { fn test_iter_string() { let data = [Some("John".to_string()), Some("Doe".to_string()), None]; let a: Series = data.clone().into_iter().collect(); - let b = Series::new("", data); + let b = Series::new("".into(), data); assert_eq!(a, b); } } diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index dea00542db4d..42afeb4f12c4 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -52,7 +52,7 @@ use crate::POOL; /// You can do standard arithmetic on series. 
/// ``` /// # use polars_core::prelude::*; -/// let s = Series::new("a", [1 , 2, 3]); +/// let s = Series::new("a".into(), [1 , 2, 3]); /// let out_add = &s + &s; /// let out_sub = &s - &s; /// let out_div = &s / &s; @@ -79,7 +79,7 @@ use crate::POOL; /// /// ``` /// # use polars_core::prelude::*; -/// let s = Series::new("dollars", &[1, 2, 3]); +/// let s = Series::new("dollars".into(), &[1, 2, 3]); /// let mask = s.equal(1).unwrap(); /// let valid = [true, false, false].iter(); /// assert!(mask @@ -101,7 +101,7 @@ use crate::POOL; /// ``` /// use polars_core::prelude::*; /// let pi = 3.14; -/// let s = Series::new("angle", [2f32 * pi, pi, 1.5 * pi].as_ref()); +/// let s = Series::new("angle".into(), [2f32 * pi, pi, 1.5 * pi].as_ref()); /// let s_cos: Series = s.f32() /// .expect("series was not an f32 dtype") /// .into_iter() @@ -116,10 +116,10 @@ use crate::POOL; /// ``` /// # use polars_core::prelude::*; /// // Series can be created from Vec's, slices and arrays -/// Series::new("boolean series", &[true, false, true]); -/// Series::new("int series", &[1, 2, 3]); +/// Series::new("boolean series".into(), &[true, false, true]); +/// Series::new("int series".into(), &[1, 2, 3]); /// // And can be nullable -/// Series::new("got nulls", &[Some(1), None, Some(2)]); +/// Series::new("got nulls".into(), &[Some(1), None, Some(2)]); /// /// // Series can also be collected from iterators /// let from_iter: Series = (0..10) @@ -156,7 +156,7 @@ impl Hash for Wrap { impl Series { /// Create a new empty Series. 
- pub fn new_empty(name: &str, dtype: &DataType) -> Series { + pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Series { Series::full_null(name, 0, dtype) } @@ -167,9 +167,12 @@ impl Series { match self.dtype() { #[cfg(feature = "object")] DataType::Object(_, _) => self - .take(&ChunkedArray::::new_vec("", vec![])) + .take(&ChunkedArray::::new_vec( + PlSmallStr::const_default(), + vec![], + )) .unwrap(), - dt => Series::new_empty(self.name(), dt), + dt => Series::new_empty(self.name().clone(), dt), } } } @@ -259,18 +262,18 @@ impl Series { } /// Rename series. - pub fn rename(&mut self, name: &str) -> &mut Series { + pub fn rename(&mut self, name: PlSmallStr) -> &mut Series { self._get_inner_mut().rename(name); self } /// Return this Series with a new name. - pub fn with_name(mut self, name: &str) -> Series { + pub fn with_name(mut self, name: PlSmallStr) -> Series { self.rename(name); self } - /// Try to set the [`Metadata`] for the underlying [`ChunkedArray`] + /// Try to set the [`Metadata`] for the underlying [`ChunkedArray`] /// /// This does not guarantee that the [`Metadata`] is always set. It returns whether it was /// successful. 
@@ -288,16 +291,16 @@ impl Series { true } - pub fn from_arrow_chunks(name: &str, arrays: Vec) -> PolarsResult { + pub fn from_arrow_chunks(name: PlSmallStr, arrays: Vec) -> PolarsResult { Self::try_from((name, arrays)) } - pub fn from_arrow(name: &str, array: ArrayRef) -> PolarsResult { + pub fn from_arrow(name: PlSmallStr, array: ArrayRef) -> PolarsResult { Self::try_from((name, array)) } #[cfg(feature = "arrow_rs")] - pub fn from_arrow_rs(name: &str, array: &dyn arrow_array::Array) -> PolarsResult { + pub fn from_arrow_rs(name: PlSmallStr, array: &dyn arrow_array::Array) -> PolarsResult { Self::from_arrow(name, array.into()) } @@ -346,9 +349,9 @@ impl Series { /// ```rust /// # use polars_core::prelude::*; /// # fn main() -> PolarsResult<()> { - /// let s = Series::new("foo", [2, 1, 3]); + /// let s = Series::new("foo".into(), [2, 1, 3]); /// let sorted = s.sort(SortOptions::default())?; - /// assert_eq!(sorted, Series::new("foo", [1, 2, 3])); + /// assert_eq!(sorted, Series::new("foo".into(), [1, 2, 3])); /// # Ok(()) /// } /// ``` @@ -437,7 +440,7 @@ impl Series { // Always allow casting all nulls to other all nulls. 
let len = self.len(); if self.null_count() == len { - return Ok(Series::full_null(self.name(), len, dtype)); + return Ok(Series::full_null(self.name().clone(), len, dtype)); } let new_options = match options { @@ -540,7 +543,9 @@ impl Series { match self.dtype() { DataType::Float32 => Ok(self.f32().unwrap().is_nan()), DataType::Float64 => Ok(self.f64().unwrap().is_nan()), - dt if dt.is_numeric() => Ok(BooleanChunked::full(self.name(), false, self.len())), + dt if dt.is_numeric() => { + Ok(BooleanChunked::full(self.name().clone(), false, self.len())) + }, _ => polars_bail!(opq = is_nan, self.dtype()), } } @@ -550,7 +555,9 @@ impl Series { match self.dtype() { DataType::Float32 => Ok(self.f32().unwrap().is_not_nan()), DataType::Float64 => Ok(self.f64().unwrap().is_not_nan()), - dt if dt.is_numeric() => Ok(BooleanChunked::full(self.name(), true, self.len())), + dt if dt.is_numeric() => { + Ok(BooleanChunked::full(self.name().clone(), true, self.len())) + }, _ => polars_bail!(opq = is_not_nan, self.dtype()), } } @@ -560,7 +567,9 @@ impl Series { match self.dtype() { DataType::Float32 => Ok(self.f32().unwrap().is_finite()), DataType::Float64 => Ok(self.f64().unwrap().is_finite()), - dt if dt.is_numeric() => Ok(BooleanChunked::full(self.name(), true, self.len())), + dt if dt.is_numeric() => { + Ok(BooleanChunked::full(self.name().clone(), true, self.len())) + }, _ => polars_bail!(opq = is_finite, self.dtype()), } } @@ -570,7 +579,9 @@ impl Series { match self.dtype() { DataType::Float32 => Ok(self.f32().unwrap().is_infinite()), DataType::Float64 => Ok(self.f64().unwrap().is_infinite()), - dt if dt.is_numeric() => Ok(BooleanChunked::full(self.name(), false, self.len())), + dt if dt.is_numeric() => { + Ok(BooleanChunked::full(self.name().clone(), false, self.len())) + }, _ => polars_bail!(opq = is_infinite, self.dtype()), } } @@ -620,7 +631,7 @@ impl Series { .iter() .map(|s| s.to_physical_repr().into_owned()) .collect(); - let mut ca = 
StructChunked::from_series(self.name(), &fields).unwrap(); + let mut ca = StructChunked::from_series(self.name().clone(), &fields).unwrap(); if arr.null_count() > 0 { ca.zip_outer_validity(arr); @@ -643,7 +654,7 @@ impl Series { pub fn gather_every(&self, n: usize, offset: usize) -> Series { let idx = ((offset as IdxSize)..self.len() as IdxSize) .step_by(n) - .collect_ca(""); + .collect_ca(PlSmallStr::const_default()); // SAFETY: we stay in-bounds. unsafe { self.take_unchecked(&idx) } } @@ -892,7 +903,7 @@ impl Series { s.dtype().to_physical().to_arrow(CompatLevel::newest()), ); let new_arr = LargeListArray::new(data_type, offsets.into(), values, None); - let mut out = ListChunked::with_chunk(s.name(), new_arr); + let mut out = ListChunked::with_chunk(s.name().clone(), new_arr); out.set_inner_dtype(s.dtype().clone()); out } @@ -969,7 +980,7 @@ mod test { #[test] fn cast() { - let ar = UInt32Chunked::new("a", &[1, 2]); + let ar = UInt32Chunked::new("a".into(), &[1, 2]); let s = ar.into_series(); let s2 = s.cast(&DataType::Int64).unwrap(); @@ -980,9 +991,9 @@ mod test { #[test] fn new_series() { - let _ = Series::new("boolean series", &vec![true, false, true]); - let _ = Series::new("int series", &[1, 2, 3]); - let ca = Int32Chunked::new("a", &[1, 2, 3]); + let _ = Series::new("boolean series".into(), &vec![true, false, true]); + let _ = Series::new("int series".into(), &[1, 2, 3]); + let ca = Int32Chunked::new("a".into(), &[1, 2, 3]); let _ = ca.into_series(); } @@ -991,7 +1002,7 @@ mod test { fn new_series_from_empty_structs() { let dtype = DataType::Struct(vec![]); let empties = vec![AnyValue::StructOwned(Box::new((vec![], vec![]))); 3]; - let s = Series::from_any_values_and_dtype("", &empties, &dtype, false).unwrap(); + let s = Series::from_any_values_and_dtype("".into(), &empties, &dtype, false).unwrap(); assert_eq!(s.len(), 3); } #[test] @@ -999,28 +1010,28 @@ mod test { let array = UInt32Array::from_slice([1, 2, 3, 4, 5]); let array_ref: ArrayRef = 
Box::new(array); - let _ = Series::try_from(("foo", array_ref)).unwrap(); + let _ = Series::try_new("foo".into(), array_ref).unwrap(); } #[test] fn series_append() { - let mut s1 = Series::new("a", &[1, 2]); - let s2 = Series::new("b", &[3]); + let mut s1 = Series::new("a".into(), &[1, 2]); + let s2 = Series::new("b".into(), &[3]); s1.append(&s2).unwrap(); assert_eq!(s1.len(), 3); // add wrong type - let s2 = Series::new("b", &[3.0]); + let s2 = Series::new("b".into(), &[3.0]); assert!(s1.append(&s2).is_err()) } #[test] #[cfg(feature = "dtype-decimal")] fn series_append_decimal() { - let s1 = Series::new("a", &[1.1, 2.3]) + let s1 = Series::new("a".into(), &[1.1, 2.3]) .cast(&DataType::Decimal(None, Some(2))) .unwrap(); - let s2 = Series::new("b", &[3]) + let s2 = Series::new("b".into(), &[3]) .cast(&DataType::Decimal(None, Some(0))) .unwrap(); @@ -1040,7 +1051,7 @@ mod test { #[test] fn series_slice_works() { - let series = Series::new("a", &[1i64, 2, 3, 4, 5]); + let series = Series::new("a".into(), &[1i64, 2, 3, 4, 5]); let slice_1 = series.slice(-3, 3); let slice_2 = series.slice(-5, 5); @@ -1053,7 +1064,7 @@ mod test { #[test] fn out_of_range_slice_does_not_panic() { - let series = Series::new("a", &[1i64, 2, 3, 4, 5]); + let series = Series::new("a".into(), &[1i64, 2, 3, 4, 5]); let _ = series.slice(-3, 4); let _ = series.slice(-6, 2); diff --git a/crates/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs index 6441dfe03df4..ce57e42c610c 100644 --- a/crates/polars-core/src/series/ops/downcast.rs +++ b/crates/polars-core/src/series/ops/downcast.rs @@ -36,7 +36,7 @@ impl Series { /// Unpack to [`ChunkedArray`] /// ``` /// # use polars_core::prelude::*; - /// let s = Series::new("foo", [1i32 ,2, 3]); + /// let s = Series::new("foo".into(), [1i32 ,2, 3]); /// let s_squared: Series = s.i32() /// .unwrap() /// .into_iter() diff --git a/crates/polars-core/src/series/ops/extend.rs b/crates/polars-core/src/series/ops/extend.rs index 
08a196335f4c..c79385faaecf 100644 --- a/crates/polars-core/src/series/ops/extend.rs +++ b/crates/polars-core/src/series/ops/extend.rs @@ -4,7 +4,7 @@ impl Series { /// Extend with a constant value. pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { // TODO: Use `from_any_values_and_dtype` here instead of casting afterwards - let s = Series::from_any_values("", &[value], true).unwrap(); + let s = Series::from_any_values(PlSmallStr::const_default(), &[value], true).unwrap(); let s = s.cast(self.dtype())?; let to_append = s.new_from_index(0, n); diff --git a/crates/polars-core/src/series/ops/null.rs b/crates/polars-core/src/series/ops/null.rs index 0f46af8065bb..3e6a32e0a9d7 100644 --- a/crates/polars-core/src/series/ops/null.rs +++ b/crates/polars-core/src/series/ops/null.rs @@ -5,7 +5,7 @@ use crate::chunked_array::object::registry::get_object_builder; use crate::prelude::*; impl Series { - pub fn full_null(name: &str, size: usize, dtype: &DataType) -> Self { + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Self { // match the logical types and create them match dtype { DataType::List(inner_dtype) => { @@ -53,7 +53,7 @@ impl Series { DataType::Struct(fields) => { let fields = fields .iter() - .map(|fld| Series::full_null(fld.name(), size, fld.data_type())) + .map(|fld| Series::full_null(fld.name().clone(), size, fld.data_type())) .collect::>(); let ca = StructChunked::from_series(name, &fields).unwrap(); diff --git a/crates/polars-core/src/series/ops/reshape.rs b/crates/polars-core/src/series/ops/reshape.rs index 550a12f54829..76d8d59886c7 100644 --- a/crates/polars-core/src/series/ops/reshape.rs +++ b/crates/polars-core/src/series/ops/reshape.rs @@ -15,7 +15,7 @@ use crate::chunked_array::builder::get_list_builder; use crate::datatypes::{DataType, ListChunked}; use crate::prelude::{IntoSeries, Series, *}; -fn reshape_fast_path(name: &str, s: &Series) -> Series { +fn reshape_fast_path(name: PlSmallStr, s: &Series) -> 
Series { let mut ca = match s.dtype() { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { @@ -44,7 +44,7 @@ impl Series { .map(|arr| arr.values().clone()) .collect::>(); // Safety: guarded by the type system - unsafe { Series::from_chunks_and_dtype_unchecked(s.name(), chunks, dtype) } + unsafe { Series::from_chunks_and_dtype_unchecked(s.name().clone(), chunks, dtype) } .get_leaf_array() }, DataType::List(dtype) => { @@ -54,7 +54,7 @@ impl Series { .map(|arr| arr.values().clone()) .collect::>(); // Safety: guarded by the type system - unsafe { Series::from_chunks_and_dtype_unchecked(s.name(), chunks, dtype) } + unsafe { Series::from_chunks_and_dtype_unchecked(s.name().clone(), chunks, dtype) } .get_leaf_array() }, _ => s.clone(), @@ -83,7 +83,7 @@ impl Series { ) }; - let mut ca = ListChunked::with_chunk(s.name(), arr); + let mut ca = ListChunked::with_chunk(s.name().clone(), arr); unsafe { ca.to_logical(inner_type.clone()) }; ca.set_fast_explode(); Ok(ca) @@ -165,7 +165,7 @@ impl Series { } Ok(unsafe { Series::from_chunks_and_dtype_unchecked( - leaf_array.name(), + leaf_array.name().clone(), vec![prev_array], &prev_dtype, ) @@ -203,7 +203,7 @@ impl Series { if s_ref.len() == 0_usize { if (rows == -1 || rows == 0) && (cols == -1 || cols == 0 || cols == 1) { - let s = reshape_fast_path(s.name(), s_ref); + let s = reshape_fast_path(s.name().clone(), s_ref); return Ok(s); } else { polars_bail!(InvalidOperation: "cannot reshape len 0 into shape {:?}", dimensions,) @@ -222,7 +222,7 @@ impl Series { // Fast path, we can create a unit list so we only allocate offsets. 
if rows as usize == s_ref.len() && cols == 1 { - let s = reshape_fast_path(s.name(), s_ref); + let s = reshape_fast_path(s.name().clone(), s_ref); return Ok(s); } @@ -232,7 +232,7 @@ impl Series { ); let mut builder = - get_list_builder(s_ref.dtype(), s_ref.len(), rows as usize, s.name())?; + get_list_builder(s_ref.dtype(), s_ref.len(), rows as usize, s.name().clone())?; let mut offset = 0i64; for _ in 0..rows { @@ -256,9 +256,9 @@ mod test { #[test] fn test_to_list() -> PolarsResult<()> { - let s = Series::new("a", &[1, 2, 3]); + let s = Series::new("a".into(), &[1, 2, 3]); - let mut builder = get_list_builder(s.dtype(), s.len(), 1, s.name())?; + let mut builder = get_list_builder(s.dtype(), s.len(), 1, s.name().clone())?; builder.append_series(&s).unwrap(); let expected = builder.finish(); @@ -270,7 +270,7 @@ mod test { #[test] fn test_reshape() -> PolarsResult<()> { - let s = Series::new("a", &[1, 2, 3, 4]); + let s = Series::new("a".into(), &[1, 2, 3, 4]); for (dims, list_len) in [ (&[-1, 1], 4), diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 1aa606205dc3..21cc1bf37df0 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -61,7 +61,7 @@ pub(crate) mod private { #[cfg(feature = "object")] fn get_list_builder( &self, - _name: &str, + _name: PlSmallStr, _values_capacity: usize, _list_capacity: usize, ) -> Box { @@ -79,10 +79,6 @@ pub(crate) mod private { fn _set_flags(&mut self, flags: MetadataFlags); - fn explode_by_offsets(&self, _offsets: &[i64]) -> Series { - invalid_operation_panic!(explode_by_offsets, self) - } - unsafe fn equal_element( &self, _idx_self: usize, @@ -111,29 +107,29 @@ pub(crate) mod private { } #[cfg(feature = "algorithm_group_by")] unsafe fn agg_min(&self, groups: &GroupsProxy) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), 
groups.len(), self._dtype()) } #[cfg(feature = "algorithm_group_by")] unsafe fn agg_max(&self, groups: &GroupsProxy) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } /// If the [`DataType`] is one of `{Int8, UInt8, Int16, UInt16}` the `Series` is /// first cast to `Int64` to prevent overflow issues. #[cfg(feature = "algorithm_group_by")] unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } #[cfg(feature = "algorithm_group_by")] unsafe fn agg_std(&self, groups: &GroupsProxy, _ddof: u8) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } #[cfg(feature = "algorithm_group_by")] unsafe fn agg_var(&self, groups: &GroupsProxy, _ddof: u8) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } #[cfg(feature = "algorithm_group_by")] unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { - Series::full_null(self._field().name(), groups.len(), self._dtype()) + Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } fn subtract(&self, _rhs: &Series) -> PolarsResult { @@ -179,7 +175,7 @@ pub trait SeriesTrait: Send + Sync + private::PrivateSeries + private::PrivateSeriesNumeric { /// Rename the Series. - fn rename(&mut self, name: &str); + fn rename(&mut self, name: PlSmallStr); fn bitand(&self, _other: &Series) -> PolarsResult { polars_bail!(opq = bitand, self._dtype()); @@ -201,7 +197,7 @@ pub trait SeriesTrait: fn chunk_lengths(&self) -> ChunkLenIter; /// Name of series. 
- fn name(&self) -> &str; + fn name(&self) -> &PlSmallStr; /// Get field (used in schema) fn field(&self) -> Cow { @@ -326,7 +322,7 @@ pub trait SeriesTrait: /// /// ```rust /// use polars_core::prelude::*; - /// let s = Series::new("a", [0i32, 1, 8]); + /// let s = Series::new("a".into(), [0i32, 1, 8]); /// let s2 = s.new_from_index(2, 4); /// assert_eq!(Vec::from(s2.i32().unwrap()), &[Some(8), Some(8), Some(8), Some(8)]) /// ``` @@ -408,7 +404,7 @@ pub trait SeriesTrait: /// ```rust /// # use polars_core::prelude::*; /// fn example() -> PolarsResult<()> { - /// let s = Series::new("series", &[1, 2, 3]); + /// let s = Series::new("series".into(), &[1, 2, 3]); /// /// let shifted = s.shift(1); /// assert_eq!(Vec::from(shifted.i32()?), &[None, Some(1), Some(2)]); diff --git a/crates/polars-core/src/testing.rs b/crates/polars-core/src/testing.rs index 91d6b998c671..bf056b5f7769 100644 --- a/crates/polars-core/src/testing.rs +++ b/crates/polars-core/src/testing.rs @@ -181,26 +181,26 @@ mod test { #[test] fn test_series_equals() { - let a = Series::new("a", &[1_u32, 2, 3]); - let b = Series::new("a", &[1_u32, 2, 3]); + let a = Series::new("a".into(), &[1_u32, 2, 3]); + let b = Series::new("a".into(), &[1_u32, 2, 3]); assert!(a.equals(&b)); - let s = Series::new("foo", &[None, Some(1i64)]); + let s = Series::new("foo".into(), &[None, Some(1i64)]); assert!(s.equals_missing(&s)); } #[test] fn test_series_dtype_not_equal() { - let s_i32 = Series::new("a", &[1_i32, 2_i32]); - let s_i64 = Series::new("a", &[1_i64, 2_i64]); + let s_i32 = Series::new("a".into(), &[1_i32, 2_i32]); + let s_i64 = Series::new("a".into(), &[1_i64, 2_i64]); assert!(s_i32.dtype() != s_i64.dtype()); assert!(s_i32.equals(&s_i64)); } #[test] fn test_df_equal() { - let a = Series::new("a", [1, 2, 3].as_ref()); - let b = Series::new("b", [1, 2, 3].as_ref()); + let a = Series::new("a".into(), [1, 2, 3].as_ref()); + let b = Series::new("b".into(), [1, 2, 3].as_ref()); let df1 = DataFrame::new(vec![a, 
b]).unwrap(); assert!(df1.equals(&df1)) diff --git a/crates/polars-core/src/tests.rs b/crates/polars-core/src/tests.rs index 12e2701bb836..e8a8111225b7 100644 --- a/crates/polars-core/src/tests.rs +++ b/crates/polars-core/src/tests.rs @@ -4,9 +4,9 @@ use crate::prelude::*; fn test_initial_empty_sort() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1396 let data = vec![1.3; 42]; - let mut series = Series::new("data", Vec::::new()); - let series2 = Series::new("data2", data.clone()); - let series3 = Series::new("data3", data); + let mut series = Series::new("data".into(), Vec::::new()); + let series2 = Series::new("data2".into(), data.clone()); + let series3 = Series::new("data3".into(), data); let df = DataFrame::new(vec![series2, series3])?; for column in df.get_columns().iter() { diff --git a/crates/polars-core/src/utils/flatten.rs b/crates/polars-core/src/utils/flatten.rs index a3cd58c79c92..52b1c69ea6d9 100644 --- a/crates/polars-core/src/utils/flatten.rs +++ b/crates/polars-core/src/utils/flatten.rs @@ -12,7 +12,7 @@ pub fn flatten_df_iter(df: &DataFrame) -> impl Iterator + '_ { // SAFETY: // datatypes are correct let mut out = unsafe { - Series::from_chunks_and_dtype_unchecked(s.name(), vec![arr], s.dtype()) + Series::from_chunks_and_dtype_unchecked(s.name().clone(), vec![arr], s.dtype()) }; out.set_sorted_flag(s.is_sorted_flag()); out @@ -33,7 +33,9 @@ pub fn flatten_series(s: &Series) -> Vec { unsafe { s.chunks() .iter() - .map(|arr| Series::from_chunks_and_dtype_unchecked(name, vec![arr.clone()], dtype)) + .map(|arr| { + Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr.clone()], dtype) + }) .collect() } } diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index 4a078ae0f2c6..c5dcd892b9b0 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -18,7 +18,6 @@ use num_traits::{One, Zero}; use rayon::prelude::*; pub use schema::*; pub use series::*; -use 
smartstring::alias::String as SmartString; pub use supertype::*; pub use {arrow, rayon}; @@ -161,7 +160,7 @@ impl Container for ChunkedArray { fn iter_chunks(&self) -> impl Iterator { self.downcast_iter() - .map(|arr| Self::with_chunk(self.name(), arr.clone())) + .map(|arr| Self::with_chunk(self.name().clone(), arr.clone())) } fn n_chunks(&self) -> usize { @@ -685,7 +684,7 @@ macro_rules! apply_method_physical_numeric { macro_rules! df { ($($col_name:expr => $slice:expr), + $(,)?) => { $crate::prelude::DataFrame::new(vec![ - $(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name, $slice),)+ + $(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name.into(), $slice),)+ ]) } } @@ -996,42 +995,18 @@ where combine_validities_and(left_validity.as_ref(), right_validity.as_ref()) } +/// Convenience for `x.into_iter().map(Into::into).collect()` using an `into_vec()` function. pub trait IntoVec { fn into_vec(self) -> Vec; } -pub trait Arg {} -impl Arg for bool {} - -impl IntoVec for bool { - fn into_vec(self) -> Vec { - vec![self] - } -} - -impl IntoVec for Vec { - fn into_vec(self) -> Self { - self - } -} - -impl IntoVec for I -where - I: IntoIterator, - S: AsRef, -{ - fn into_vec(self) -> Vec { - self.into_iter().map(|s| s.as_ref().to_string()).collect() - } -} - -impl IntoVec for I +impl IntoVec for I where I: IntoIterator, - S: AsRef, + S: Into, { - fn into_vec(self) -> Vec { - self.into_iter().map(|s| s.as_ref().into()).collect() + fn into_vec(self) -> Vec { + self.into_iter().map(|s| s.into()).collect() } } @@ -1204,7 +1179,7 @@ mod test { #[test] fn test_split() { - let ca: Int32Chunked = (0..10).collect_ca("a"); + let ca: Int32Chunked = (0..10).collect_ca("a".into()); let out = split(&ca, 3); assert_eq!(out[0].len(), 3); @@ -1214,9 +1189,9 @@ mod test { #[test] fn test_align_chunks() -> PolarsResult<()> { - let a = Int32Chunked::new("", &[1, 2, 3, 4]); - let mut b = Int32Chunked::new("", &[1]); - let b2 = 
Int32Chunked::new("", &[2, 3, 4]); + let a = Int32Chunked::new(PlSmallStr::const_default(), &[1, 2, 3, 4]); + let mut b = Int32Chunked::new(PlSmallStr::const_default(), &[1]); + let b2 = Int32Chunked::new(PlSmallStr::const_default(), &[2, 3, 4]); b.append(&b2)?; let (a, b) = align_chunks_binary(&a, &b); @@ -1225,8 +1200,8 @@ mod test { b.chunk_lengths().collect::>() ); - let a = Int32Chunked::new("", &[1, 2, 3, 4]); - let mut b = Int32Chunked::new("", &[1]); + let a = Int32Chunked::new(PlSmallStr::const_default(), &[1, 2, 3, 4]); + let mut b = Int32Chunked::new(PlSmallStr::const_default(), &[1]); let b1 = b.clone(); b.append(&b1)?; b.append(&b1)?; diff --git a/crates/polars-core/src/utils/schema.rs b/crates/polars-core/src/utils/schema.rs index c528f3160624..558a0ea8f1b8 100644 --- a/crates/polars-core/src/utils/schema.rs +++ b/crates/polars-core/src/utils/schema.rs @@ -1,3 +1,5 @@ +use polars_utils::format_pl_smallstr; + use crate::prelude::*; /// Convert a collection of [`DataType`] into a schema. 
@@ -12,6 +14,6 @@ where dtypes .into_iter() .enumerate() - .map(|(i, dtype)| Field::new(format!("column_{i}").as_ref(), dtype)) + .map(|(i, dtype)| Field::new(format_pl_smallstr!("column_{i}"), dtype)) .collect() } diff --git a/crates/polars-core/src/utils/series.rs b/crates/polars-core/src/utils/series.rs index fb9d674100e1..06848ac8e577 100644 --- a/crates/polars-core/src/utils/series.rs +++ b/crates/polars-core/src/utils/series.rs @@ -9,7 +9,7 @@ pub fn with_unstable_series(dtype: &DataType, f: F) -> T where F: Fn(&mut AmortSeries) -> T, { - let container = Series::full_null("", 0, dtype); + let container = Series::full_null(PlSmallStr::const_default(), 0, dtype); let mut us = AmortSeries::new(Rc::new(container)); f(&mut us) diff --git a/crates/polars-core/src/utils/supertype.rs b/crates/polars-core/src/utils/supertype.rs index 44d4578bf57e..027e85886793 100644 --- a/crates/polars-core/src/utils/supertype.rs +++ b/crates/polars-core/src/utils/supertype.rs @@ -369,7 +369,7 @@ pub fn get_supertype_with_options( let mut new_fields = Vec::with_capacity(fields_a.len()); for a in fields_a { let st = get_supertype(&a.dtype, rhs)?; - new_fields.push(Field::new(&a.name, st)) + new_fields.push(Field::new(a.name.clone(), st)) } Some(Struct(new_fields)) } @@ -426,7 +426,7 @@ fn union_struct_fields(fields_a: &[Field], fields_b: &[Field]) -> Option>(); Some(DataType::Struct(new_fields)) } @@ -442,7 +442,7 @@ fn super_type_structs(fields_a: &[Field], fields_b: &[Field]) -> Option { if MetadataEnv::experimental_enabled() { if let Some(sc) = s.get_metadata().and_then(|v| v.min_value()) { - return Ok(sc.into_series(s.name())); + return Ok(sc.into_series(s.name().clone())); } } match s.is_sorted_flag() { IsSorted::Ascending | IsSorted::Descending => { - s.min_reduce().map(|sc| sc.into_series(s.name())) + s.min_reduce().map(|sc| sc.into_series(s.name().clone())) }, IsSorted::Not => parallel_op_series( - |s| s.min_reduce().map(|sc| sc.into_series(s.name())), + |s| 
s.min_reduce().map(|sc| sc.into_series(s.name().clone())), s, allow_threading, ), @@ -89,7 +89,7 @@ impl PhysicalExpr for AggregationExpr { |s| { Ok(polars_ops::prelude::nan_propagating_aggregate::nan_min_s( &s, - s.name(), + s.name().clone(), )) }, s, @@ -102,16 +102,16 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Max => { if MetadataEnv::experimental_enabled() { if let Some(sc) = s.get_metadata().and_then(|v| v.max_value()) { - return Ok(sc.into_series(s.name())); + return Ok(sc.into_series(s.name().clone())); } } match s.is_sorted_flag() { IsSorted::Ascending | IsSorted::Descending => { - s.max_reduce().map(|sc| sc.into_series(s.name())) + s.max_reduce().map(|sc| sc.into_series(s.name().clone())) }, IsSorted::Not => parallel_op_series( - |s| s.max_reduce().map(|sc| sc.into_series(s.name())), + |s| s.max_reduce().map(|sc| sc.into_series(s.name().clone())), s, allow_threading, ), @@ -122,7 +122,7 @@ impl PhysicalExpr for AggregationExpr { |s| { Ok(polars_ops::prelude::nan_propagating_aggregate::nan_max_s( &s, - s.name(), + s.name().clone(), )) }, s, @@ -132,20 +132,20 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::NanMax => { panic!("activate 'propagate_nans' feature") }, - GroupByMethod::Median => s.median_reduce().map(|sc| sc.into_series(s.name())), - GroupByMethod::Mean => Ok(s.mean_reduce().into_series(s.name())), + GroupByMethod::Median => s.median_reduce().map(|sc| sc.into_series(s.name().clone())), + GroupByMethod::Mean => Ok(s.mean_reduce().into_series(s.name().clone())), GroupByMethod::First => Ok(if s.is_empty() { - Series::full_null(s.name(), 1, s.dtype()) + Series::full_null(s.name().clone(), 1, s.dtype()) } else { s.head(Some(1)) }), GroupByMethod::Last => Ok(if s.is_empty() { - Series::full_null(s.name(), 1, s.dtype()) + Series::full_null(s.name().clone(), 1, s.dtype()) } else { s.tail(Some(1)) }), GroupByMethod::Sum => parallel_op_series( - |s| s.sum_reduce().map(|sc| sc.into_series(s.name())), + |s| s.sum_reduce().map(|sc| 
sc.into_series(s.name().clone())), s, allow_threading, ), @@ -154,21 +154,26 @@ impl PhysicalExpr for AggregationExpr { if MetadataEnv::experimental_enabled() { if let Some(count) = s.get_metadata().and_then(|v| v.distinct_count()) { let count = count + IdxSize::from(s.null_count() > 0); - return Ok(IdxCa::from_slice(s.name(), &[count]).into_series()); + return Ok(IdxCa::from_slice(s.name().clone(), &[count]).into_series()); } } - s.n_unique() - .map(|count| IdxCa::from_slice(s.name(), &[count as IdxSize]).into_series()) + s.n_unique().map(|count| { + IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_series() + }) }, GroupByMethod::Count { include_nulls } => { let count = s.len() - s.null_count() * !include_nulls as usize; - Ok(IdxCa::from_slice(s.name(), &[count as IdxSize]).into_series()) + Ok(IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_series()) }, GroupByMethod::Implode => s.implode().map(|ca| ca.into_series()), - GroupByMethod::Std(ddof) => s.std_reduce(ddof).map(|sc| sc.into_series(s.name())), - GroupByMethod::Var(ddof) => s.var_reduce(ddof).map(|sc| sc.into_series(s.name())), + GroupByMethod::Std(ddof) => s + .std_reduce(ddof) + .map(|sc| sc.into_series(s.name().clone())), + GroupByMethod::Var(ddof) => s + .var_reduce(ddof) + .map(|sc| sc.into_series(s.name().clone())), GroupByMethod::Quantile(_, _) => unimplemented!(), } } @@ -181,7 +186,7 @@ impl PhysicalExpr for AggregationExpr { ) -> PolarsResult> { let mut ac = self.input.evaluate_on_groups(df, groups, state)?; // don't change names by aggregations as is done in polars-core - let keep_name = ac.series().name().to_string(); + let keep_name = ac.series().name().clone(); polars_ensure!(!matches!(ac.agg_state(), AggState::Literal(_)), ComputeError: "cannot aggregate a literal"); if let AggregatedScalar(_) = ac.agg_state() { @@ -200,27 +205,27 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::Min => { let (s, groups) = ac.get_final_aggregation(); let agg_s = 
s.agg_min(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Max => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_max(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Median => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_median(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Mean => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_mean(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Sum => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_sum(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Count { include_nulls } => { if include_nulls || ac.series().null_count() == 0 { @@ -262,7 +267,7 @@ impl PhysicalExpr for AggregationExpr { counts.into_inner() }, }; - s.rename(&keep_name); + s.rename(keep_name); AggregatedScalar(s.into_series()) }, UpdateGroups::WithGroupsLen => { @@ -270,13 +275,13 @@ impl PhysicalExpr for AggregationExpr { // we can just get the attribute, because we only need the length, // not the correct order let mut ca = ac.groups.group_count(); - ca.rename(&keep_name); + ca.rename(keep_name); AggregatedScalar(ca.into_series()) }, // materialize groups _ => { let mut ca = ac.groups().group_count(); - ca.rename(&keep_name); + ca.rename(keep_name); AggregatedScalar(ca.into_series()) }, } @@ -285,7 +290,7 @@ impl PhysicalExpr for AggregationExpr { match ac.agg_state() { AggState::Literal(s) | AggState::AggregatedScalar(s) => { AggregatedScalar(Series::new( - &keep_name, + keep_name, [(s.len() as IdxSize - s.null_count() as IdxSize)], )) }, @@ -298,13 +303,13 @@ impl 
PhysicalExpr for AggregationExpr { .map(|s| s.len() as IdxSize - s.null_count() as IdxSize) }) .collect(); - AggregatedScalar(rename_series(out.into_series(), &keep_name)) + AggregatedScalar(rename_series(out.into_series(), keep_name)) }, AggState::NotAggregated(s) => { let s = s.clone(); let groups = ac.groups(); let out: IdxCa = if matches!(s.dtype(), &DataType::Null) { - IdxCa::full(s.name(), 0, groups.len()) + IdxCa::full(s.name().clone(), 0, groups.len()) } else { match groups.as_ref() { GroupsProxy::Idx(idx) => { @@ -322,9 +327,7 @@ impl PhysicalExpr for AggregationExpr { }); count }) - .collect_ca_trusted_with_dtype( - &keep_name, IDX_DTYPE, - ) + .collect_ca_trusted_with_dtype(keep_name, IDX_DTYPE) }, GroupsProxy::Slice { groups, .. } => { // Slice and use computed null count @@ -338,9 +341,7 @@ impl PhysicalExpr for AggregationExpr { .null_count() as IdxSize }) - .collect_ca_trusted_with_dtype( - &keep_name, IDX_DTYPE, - ) + .collect_ca_trusted_with_dtype(keep_name, IDX_DTYPE) }, } }; @@ -352,17 +353,17 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::First => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_first(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Last => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_last(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::NUnique => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_n_unique(&groups); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Implode => { // if the aggregation is already @@ -380,22 +381,22 @@ impl PhysicalExpr for AggregationExpr { agg.as_list().into_series() }, }; - AggregatedList(rename_series(s, &keep_name)) + AggregatedList(rename_series(s, keep_name)) }, GroupByMethod::Groups => { let 
mut column: ListChunked = ac.groups().as_list_chunked(); - column.rename(&keep_name); + column.rename(keep_name); AggregatedScalar(column.into_series()) }, GroupByMethod::Std(ddof) => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_std(&groups, ddof); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Var(ddof) => { let (s, groups) = ac.get_final_aggregation(); let agg_s = s.agg_var(&groups, ddof); - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) }, GroupByMethod::Quantile(_, _) => { // implemented explicitly in AggQuantile struct @@ -410,7 +411,7 @@ impl PhysicalExpr for AggregationExpr { } else { s.agg_min(&groups) }; - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -426,7 +427,7 @@ impl PhysicalExpr for AggregationExpr { } else { s.agg_max(&groups) }; - AggregatedScalar(rename_series(agg_s, &keep_name)) + AggregatedScalar(rename_series(agg_s, keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -455,7 +456,7 @@ impl PhysicalExpr for AggregationExpr { } } -fn rename_series(mut s: Series, name: &str) -> Series { +fn rename_series(mut s: Series, name: PlSmallStr) -> Series { s.rename(name); s } @@ -476,7 +477,7 @@ impl PartitionedAggregation for AggregationExpr { match self.agg_type.groupby { #[cfg(feature = "dtype-struct")] GroupByMethod::Mean => { - let new_name = series.name().to_string(); + let new_name = series.name().clone(); // ensure we don't overflow // the all 8 and 16 bits integers are already upcasted to int16 on `agg_sum` @@ -486,7 +487,7 @@ impl PartitionedAggregation for AggregationExpr { } else { series.agg_sum(groups) }; - agg_s.rename(&new_name); + agg_s.rename(new_name.clone()); if !agg_s.dtype().is_numeric() { Ok(agg_s) @@ -496,48 +497,48 @@ impl PartitionedAggregation for AggregationExpr { _ => 
agg_s.cast(&DataType::Float64).unwrap(), }; let mut count_s = series.agg_valid_count(groups); - count_s.rename("__POLARS_COUNT"); - Ok(StructChunked::from_series(&new_name, &[agg_s, count_s]) + count_s.rename(PlSmallStr::from_static("__POLARS_COUNT")); + Ok(StructChunked::from_series(new_name, &[agg_s, count_s]) .unwrap() .into_series()) } }, GroupByMethod::Implode => { - let new_name = series.name(); + let new_name = series.name().clone(); let mut agg = series.agg_list(groups); agg.rename(new_name); Ok(agg) }, GroupByMethod::First => { let mut agg = series.agg_first(groups); - agg.rename(series.name()); + agg.rename(series.name().clone()); Ok(agg) }, GroupByMethod::Last => { let mut agg = series.agg_last(groups); - agg.rename(series.name()); + agg.rename(series.name().clone()); Ok(agg) }, GroupByMethod::Max => { let mut agg = series.agg_max(groups); - agg.rename(series.name()); + agg.rename(series.name().clone()); Ok(agg) }, GroupByMethod::Min => { let mut agg = series.agg_min(groups); - agg.rename(series.name()); + agg.rename(series.name().clone()); Ok(agg) }, GroupByMethod::Sum => { let mut agg = series.agg_sum(groups); - agg.rename(series.name()); + agg.rename(series.name().clone()); Ok(agg) }, GroupByMethod::Count { include_nulls: true, } => { let mut ca = groups.group_count(); - ca.rename(series.name()); + ca.rename(series.name().clone()); Ok(ca.into_series()) }, _ => { @@ -559,12 +560,12 @@ impl PartitionedAggregation for AggregationExpr { } | GroupByMethod::Sum => { let mut agg = unsafe { partitioned.agg_sum(groups) }; - agg.rename(partitioned.name()); + agg.rename(partitioned.name().clone()); Ok(agg) }, #[cfg(feature = "dtype-struct")] GroupByMethod::Mean => { - let new_name = partitioned.name(); + let new_name = partitioned.name().clone(); match partitioned.dtype() { DataType::Struct(_) => { let ca = partitioned.struct_().unwrap(); @@ -587,7 +588,7 @@ impl PartitionedAggregation for AggregationExpr { // the groups are scattered over multiple groups/sub 
dataframes. // we now must collect them into a single group let ca = partitioned.list().unwrap(); - let new_name = partitioned.name().to_string(); + let new_name = partitioned.name().clone(); let mut values = Vec::with_capacity(groups.len()); let mut can_fast_explode = true; @@ -639,7 +640,7 @@ impl PartitionedAggregation for AggregationExpr { values, None, ); - let mut ca = ListChunked::with_chunk(&new_name, arr); + let mut ca = ListChunked::with_chunk(new_name, arr); if can_fast_explode { ca.set_fast_explode() } @@ -647,22 +648,22 @@ impl PartitionedAggregation for AggregationExpr { }, GroupByMethod::First => { let mut agg = unsafe { partitioned.agg_first(groups) }; - agg.rename(partitioned.name()); + agg.rename(partitioned.name().clone()); Ok(agg) }, GroupByMethod::Last => { let mut agg = unsafe { partitioned.agg_last(groups) }; - agg.rename(partitioned.name()); + agg.rename(partitioned.name().clone()); Ok(agg) }, GroupByMethod::Max => { let mut agg = unsafe { partitioned.agg_max(groups) }; - agg.rename(partitioned.name()); + agg.rename(partitioned.name().clone()); Ok(agg) }, GroupByMethod::Min => { let mut agg = unsafe { partitioned.agg_min(groups) }; - agg.rename(partitioned.name()); + agg.rename(partitioned.name().clone()); Ok(agg) }, _ => unimplemented!(), @@ -709,7 +710,7 @@ impl PhysicalExpr for AggQuantileExpr { let quantile = self.get_quantile(df, state)?; input .quantile_reduce(quantile, self.interpol) - .map(|sc| sc.into_series(input.name())) + .map(|sc| sc.into_series(input.name().clone())) } #[allow(clippy::ptr_arg)] fn evaluate_on_groups<'a>( @@ -720,7 +721,7 @@ impl PhysicalExpr for AggQuantileExpr { ) -> PolarsResult> { let mut ac = self.input.evaluate_on_groups(df, groups, state)?; // don't change names by aggregations as is done in polars-core - let keep_name = ac.series().name().to_string(); + let keep_name = ac.series().name().clone(); let quantile = self.get_quantile(df, state)?; @@ -731,7 +732,7 @@ impl PhysicalExpr for AggQuantileExpr { 
.into_owned() .agg_quantile(ac.groups(), quantile, self.interpol) }; - agg.rename(&keep_name); + agg.rename(keep_name); Ok(AggregationContext::from_agg_state( AggregatedScalar(agg), Cow::Borrowed(groups), diff --git a/crates/polars-expr/src/expressions/alias.rs b/crates/polars-expr/src/expressions/alias.rs index fa755fd2b233..8298bbf06ee8 100644 --- a/crates/polars-expr/src/expressions/alias.rs +++ b/crates/polars-expr/src/expressions/alias.rs @@ -5,12 +5,12 @@ use crate::expressions::{AggregationContext, PartitionedAggregation, PhysicalExp pub struct AliasExpr { pub(crate) physical_expr: Arc, - pub(crate) name: Arc, + pub(crate) name: PlSmallStr, expr: Expr, } impl AliasExpr { - pub fn new(physical_expr: Arc, name: Arc, expr: Expr) -> Self { + pub fn new(physical_expr: Arc, name: PlSmallStr, expr: Expr) -> Self { Self { physical_expr, name, @@ -19,7 +19,7 @@ impl AliasExpr { } fn finish(&self, input: Series) -> Series { - input.with_name(&self.name) + input.with_name(self.name.clone()) } } @@ -54,7 +54,7 @@ impl PhysicalExpr for AliasExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { Ok(Field::new( - &self.name, + self.name.clone(), self.physical_expr .to_field(input_schema)? 
.data_type() @@ -76,7 +76,7 @@ impl PartitionedAggregation for AliasExpr { ) -> PolarsResult { let agg = self.physical_expr.as_partitioned_aggregator().unwrap(); let s = agg.evaluate_partitioned(df, groups, state)?; - Ok(s.with_name(&self.name)) + Ok(s.with_name(self.name.clone())) } fn finalize( @@ -87,6 +87,6 @@ impl PartitionedAggregation for AliasExpr { ) -> PolarsResult { let agg = self.physical_expr.as_partitioned_aggregator().unwrap(); let s = agg.finalize(partitioned, groups, state)?; - Ok(s.with_name(&self.name)) + Ok(s.with_name(self.name.clone())) } } diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index 802e130d15f2..100f193b4db5 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -130,7 +130,11 @@ impl ApplyExpr { Ok(out) } else { let field = self.to_field(self.input_schema.as_ref().unwrap()).unwrap(); - Ok(Series::full_null(field.name(), 1, field.data_type())) + Ok(Series::full_null( + field.name().clone(), + 1, + field.data_type(), + )) } } fn apply_single_group_aware<'a>( @@ -145,17 +149,17 @@ impl ApplyExpr { ComputeError: "cannot aggregate, the column is already aggregated", ); - let name = s.name().to_string(); + let name = s.name().clone(); let agg = ac.aggregated(); // Collection of empty list leads to a null dtype. See: #3687. if agg.len() == 0 { // Create input for the function to determine the output dtype, see #3946. 
let agg = agg.list().unwrap(); let input_dtype = agg.inner_dtype(); - let input = Series::full_null("", 0, input_dtype); + let input = Series::full_null(PlSmallStr::const_default(), 0, input_dtype); let output = self.eval_and_flatten(&mut [input])?; - let ca = ListChunked::full(&name, &output, 0); + let ca = ListChunked::full(name, &output, 0); return self.finish_apply_groups(ac, ca); } @@ -163,7 +167,7 @@ impl ApplyExpr { None => Ok(None), Some(mut s) => { if self.pass_name_to_apply { - s.rename(&name); + s.rename(name.clone()); } self.function.call_udf(&mut [s]) }, @@ -181,7 +185,7 @@ impl ApplyExpr { if let Some(dtype) = dtype { // TODO! uncomment this line and remove debug_assertion after a while. // POOL.install(|| { - // iter.collect_ca_with_dtype::>("", DataType::List(Box::new(dtype))) + // iter.collect_ca_with_dtype::>(PlSmallStr::const_default(), DataType::List(Box::new(dtype))) // })? let out: ListChunked = POOL.install(|| iter.collect::>())?; @@ -199,7 +203,7 @@ impl ApplyExpr { .collect::>()? }; - self.finish_apply_groups(ac, ca.with_name(&name)) + self.finish_apply_groups(ac, ca.with_name(name)) } /// Apply elementwise e.g. ignore the group/list indices. @@ -254,14 +258,14 @@ impl ApplyExpr { ac.with_update_groups(UpdateGroups::No); let agg_state = if self.returns_scalar { - AggState::AggregatedScalar(Series::new_empty(field.name(), &field.dtype)) + AggState::AggregatedScalar(Series::new_empty(field.name().clone(), &field.dtype)) } else { match self.collect_groups { ApplyOptions::ElementWise | ApplyOptions::ApplyList => ac .agg_state() - .map(|_| Series::new_empty(field.name(), &field.dtype)), + .map(|_| Series::new_empty(field.name().clone(), &field.dtype)), ApplyOptions::GroupWise => AggState::AggregatedList(Series::new_empty( - field.name(), + field.name().clone(), &DataType::List(Box::new(field.dtype.clone())), )), } @@ -283,7 +287,7 @@ impl ApplyExpr { self.function.call_udf(&mut container) }) .collect::>()? 
- .with_name(&field.name); + .with_name(field.name.clone()); drop(iters); @@ -330,8 +334,8 @@ impl PhysicalExpr for ApplyExpr { if self.allow_rename { self.eval_and_flatten(&mut inputs) } else { - let in_name = inputs[0].name().to_string(); - Ok(self.eval_and_flatten(&mut inputs)?.with_name(&in_name)) + let in_name = inputs[0].name().clone(); + Ok(self.eval_and_flatten(&mut inputs)?.with_name(in_name)) } } @@ -577,7 +581,7 @@ impl ApplyExpr { #[cfg(feature = "is_between")] FunctionExpr::Boolean(BooleanFunction::IsBetween { closed }) => { let should_read = || -> Option { - let root: Arc = expr_to_leaf_column_name(&input[0]).ok()?; + let root: PlSmallStr = expr_to_leaf_column_name(&input[0]).ok()?; let Expr::Literal(left) = &input[1] else { return None; }; @@ -592,11 +596,20 @@ impl ApplyExpr { let (left, left_dtype) = (left.to_any_value()?, left.get_datatype()); let (right, right_dtype) = (right.to_any_value()?, right.get_datatype()); - let left = - Series::from_any_values_and_dtype("", &[left], &left_dtype, false).ok()?; - let right = - Series::from_any_values_and_dtype("", &[right], &right_dtype, false) - .ok()?; + let left = Series::from_any_values_and_dtype( + PlSmallStr::const_default(), + &[left], + &left_dtype, + false, + ) + .ok()?; + let right = Series::from_any_values_and_dtype( + PlSmallStr::const_default(), + &[right], + &right_dtype, + false, + ) + .ok()?; // don't read the row_group anyways as // the condition will evaluate to false. 
@@ -649,8 +662,8 @@ impl PartitionedAggregation for ApplyExpr { if self.allow_rename { self.eval_and_flatten(&mut [s]) } else { - let in_name = s.name().to_string(); - Ok(self.eval_and_flatten(&mut [s])?.with_name(&in_name)) + let in_name = s.name().clone(); + Ok(self.eval_and_flatten(&mut [s])?.with_name(in_name)) } } diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index 55caf00ad69a..d9ebf38070e3 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -128,7 +128,7 @@ impl BinaryExpr { mut ac_l: AggregationContext<'a>, mut ac_r: AggregationContext<'a>, ) -> PolarsResult> { - let name = ac_l.series().name().to_string(); + let name = ac_l.series().name().clone(); ac_l.groups(); ac_r.groups(); polars_ensure!(ac_l.groups.len() == ac_r.groups.len(), ComputeError: "lhs and rhs should have same group length"); @@ -139,7 +139,7 @@ impl BinaryExpr { let res_s = if res_s.len() == 1 { res_s.new_from_index(0, ac_l.groups.len()) } else { - ListChunked::full(&name, &res_s, ac_l.groups.len()).into_series() + ListChunked::full(name, &res_s, ac_l.groups.len()).into_series() }; ac_l.with_series(res_s, true, Some(&self.expr))?; Ok(ac_l) @@ -150,14 +150,14 @@ impl BinaryExpr { mut ac_l: AggregationContext<'a>, mut ac_r: AggregationContext<'a>, ) -> PolarsResult> { - let name = ac_l.series().name().to_string(); + let name = ac_l.series().name().clone(); let ca = ac_l .iter_groups(false) .zip(ac_r.iter_groups(false)) .map(|(l, r)| Some(apply_operator(l?.as_ref(), r?.as_ref(), self.op))) .map(|opt_res| opt_res.transpose()) .collect::>()? 
- .with_name(&name); + .with_name(name); ac_l.with_update_groups(UpdateGroups::WithSeriesLen); ac_l.with_agg_state(AggState::AggregatedList(ca.into_series())); diff --git a/crates/polars-expr/src/expressions/column.rs b/crates/polars-expr/src/expressions/column.rs index cac4b52ddb11..d0b0b9913935 100644 --- a/crates/polars-expr/src/expressions/column.rs +++ b/crates/polars-expr/src/expressions/column.rs @@ -7,13 +7,13 @@ use super::*; use crate::expressions::{AggregationContext, PartitionedAggregation, PhysicalExpr}; pub struct ColumnExpr { - name: Arc, + name: PlSmallStr, expr: Expr, schema: Option, } impl ColumnExpr { - pub fn new(name: Arc, expr: Expr, schema: Option) -> Self { + pub fn new(name: PlSmallStr, expr: Expr, schema: Option) -> Self { Self { name, expr, schema } } } diff --git a/crates/polars-expr/src/expressions/count.rs b/crates/polars-expr/src/expressions/count.rs index 246e939e3ef3..2d8fbeb6a2d2 100644 --- a/crates/polars-expr/src/expressions/count.rs +++ b/crates/polars-expr/src/expressions/count.rs @@ -22,7 +22,10 @@ impl PhysicalExpr for CountExpr { } fn evaluate(&self, df: &DataFrame, _state: &ExecutionState) -> PolarsResult { - Ok(Series::new("len", [df.height() as IdxSize])) + Ok(Series::new( + PlSmallStr::from_static("len"), + [df.height() as IdxSize], + )) } fn evaluate_on_groups<'a>( @@ -31,13 +34,13 @@ impl PhysicalExpr for CountExpr { groups: &'a GroupsProxy, _state: &ExecutionState, ) -> PolarsResult> { - let ca = groups.group_count().with_name(LEN); + let ca = groups.group_count().with_name(PlSmallStr::from_static(LEN)); let s = ca.into_series(); Ok(AggregationContext::new(s, Cow::Borrowed(groups), true)) } fn to_field(&self, _input_schema: &Schema) -> PolarsResult { - Ok(Field::new(LEN, IDX_DTYPE)) + Ok(Field::new(PlSmallStr::from_static(LEN), IDX_DTYPE)) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { @@ -67,6 +70,6 @@ impl PartitionedAggregation for CountExpr { ) -> PolarsResult { // SAFETY: groups 
are in bounds. let agg = unsafe { partitioned.agg_sum(groups) }; - Ok(agg.with_name(LEN)) + Ok(agg.with_name(PlSmallStr::from_static(LEN))) } } diff --git a/crates/polars-expr/src/expressions/filter.rs b/crates/polars-expr/src/expressions/filter.rs index db9ee0cf120e..4e02b38ae4b7 100644 --- a/crates/polars-expr/src/expressions/filter.rs +++ b/crates/polars-expr/src/expressions/filter.rs @@ -58,7 +58,7 @@ impl PhysicalExpr for FilterExpr { let ca = s.list()?; let out = if ca.is_empty() { // return an empty list if ca is empty. - ListChunked::full_null_with_dtype(ca.name(), 0, ca.inner_dtype()) + ListChunked::full_null_with_dtype(ca.name().clone(), 0, ca.inner_dtype()) } else { { ca.amortized_iter() @@ -70,7 +70,7 @@ impl PhysicalExpr for FilterExpr { _ => Ok(None), }) .collect::>()? - .with_name(s.name()) + .with_name(s.name().clone()) } }; ac_s.with_series(out.into_series(), true, Some(&self.expr))?; diff --git a/crates/polars-expr/src/expressions/gather.rs b/crates/polars-expr/src/expressions/gather.rs index 951833717a33..c82bedee986b 100644 --- a/crates/polars-expr/src/expressions/gather.rs +++ b/crates/polars-expr/src/expressions/gather.rs @@ -81,7 +81,7 @@ impl PhysicalExpr for GatherExpr { .map(|(s, idx)| Some(s?.as_ref().take(idx?.as_ref().idx().unwrap()))) .map(|opt_res| opt_res.transpose()) .collect::>()? 
- .with_name(ac.series().name()) + .with_name(ac.series().name().clone()) }; ac.with_series(taken.into_series(), true, Some(&self.expr))?; @@ -250,7 +250,7 @@ impl GatherExpr { &ac.dtype(), idx.series().len(), groups.len(), - ac.series().name(), + ac.series().name().clone(), )?; let iter = ac.iter_groups(false).zip(idx.iter_groups(false)); diff --git a/crates/polars-expr/src/expressions/group_iter.rs b/crates/polars-expr/src/expressions/group_iter.rs index 26c68fdae3d2..a53738facf7e 100644 --- a/crates/polars-expr/src/expressions/group_iter.rs +++ b/crates/polars-expr/src/expressions/group_iter.rs @@ -13,7 +13,11 @@ impl<'a> AggregationContext<'a> { AggState::Literal(_) => { self.groups(); let s = self.series().rechunk(); - let name = if keep_names { s.name() } else { "" }; + let name = if keep_names { + s.name().clone() + } else { + PlSmallStr::const_default() + }; // SAFETY: dtype is correct unsafe { Box::new(LitIter::new( @@ -27,7 +31,11 @@ impl<'a> AggregationContext<'a> { AggState::AggregatedScalar(_) => { self.groups(); let s = self.series(); - let name = if keep_names { s.name() } else { "" }; + let name = if keep_names { + s.name().clone() + } else { + PlSmallStr::const_default() + }; // SAFETY: dtype is correct unsafe { Box::new(FlatIter::new( @@ -41,7 +49,11 @@ impl<'a> AggregationContext<'a> { AggState::AggregatedList(_) => { let s = self.series(); let list = s.list().unwrap(); - let name = if keep_names { s.name() } else { "" }; + let name = if keep_names { + s.name().clone() + } else { + PlSmallStr::const_default() + }; Box::new(list.amortized_iter_with_name(name)) }, AggState::NotAggregated(_) => { @@ -49,7 +61,11 @@ impl<'a> AggregationContext<'a> { let _ = self.aggregated(); let s = self.series(); let list = s.list().unwrap(); - let name = if keep_names { s.name() } else { "" }; + let name = if keep_names { + s.name().clone() + } else { + PlSmallStr::const_default() + }; Box::new(list.amortized_iter_with_name(name)) }, } @@ -68,7 +84,7 @@ struct 
LitIter { impl LitIter { /// # Safety /// Caller must ensure the given `logical` dtype belongs to `array`. - unsafe fn new(array: ArrayRef, len: usize, logical: &DataType, name: &str) -> Self { + unsafe fn new(array: ArrayRef, len: usize, logical: &DataType, name: PlSmallStr) -> Self { let series_container = Rc::new(Series::from_chunks_and_dtype_unchecked( name, vec![array], @@ -117,7 +133,7 @@ struct FlatIter { impl FlatIter { /// # Safety /// Caller must ensure the given `logical` dtype belongs to `array`. - unsafe fn new(chunks: &[ArrayRef], len: usize, logical: &DataType, name: &str) -> Self { + unsafe fn new(chunks: &[ArrayRef], len: usize, logical: &DataType, name: PlSmallStr) -> Self { let mut stack = Vec::with_capacity(chunks.len()); for chunk in chunks.iter().rev() { stack.push(chunk.clone()) diff --git a/crates/polars-expr/src/expressions/literal.rs b/crates/polars-expr/src/expressions/literal.rs index 6b43825087a1..d46d109f1fa9 100644 --- a/crates/polars-expr/src/expressions/literal.rs +++ b/crates/polars-expr/src/expressions/literal.rs @@ -3,7 +3,7 @@ use std::ops::Deref; use polars_core::prelude::*; use polars_core::utils::NoNull; -use polars_plan::constants::LITERAL_NAME; +use polars_plan::constants::get_literal_name; use super::*; use crate::expressions::{AggregationContext, PartitionedAggregation, PhysicalExpr}; @@ -24,25 +24,25 @@ impl PhysicalExpr for LiteralExpr { use LiteralValue::*; let s = match &self.0 { #[cfg(feature = "dtype-i8")] - Int8(v) => Int8Chunked::full(LITERAL_NAME, *v, 1).into_series(), + Int8(v) => Int8Chunked::full(get_literal_name().clone(), *v, 1).into_series(), #[cfg(feature = "dtype-i16")] - Int16(v) => Int16Chunked::full(LITERAL_NAME, *v, 1).into_series(), - Int32(v) => Int32Chunked::full(LITERAL_NAME, *v, 1).into_series(), - Int64(v) => Int64Chunked::full(LITERAL_NAME, *v, 1).into_series(), + Int16(v) => Int16Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Int32(v) => 
Int32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Int64(v) => Int64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), #[cfg(feature = "dtype-u8")] - UInt8(v) => UInt8Chunked::full(LITERAL_NAME, *v, 1).into_series(), + UInt8(v) => UInt8Chunked::full(get_literal_name().clone(), *v, 1).into_series(), #[cfg(feature = "dtype-u16")] - UInt16(v) => UInt16Chunked::full(LITERAL_NAME, *v, 1).into_series(), - UInt32(v) => UInt32Chunked::full(LITERAL_NAME, *v, 1).into_series(), - UInt64(v) => UInt64Chunked::full(LITERAL_NAME, *v, 1).into_series(), - Float32(v) => Float32Chunked::full(LITERAL_NAME, *v, 1).into_series(), - Float64(v) => Float64Chunked::full(LITERAL_NAME, *v, 1).into_series(), + UInt16(v) => UInt16Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + UInt32(v) => UInt32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + UInt64(v) => UInt64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Float32(v) => Float32Chunked::full(get_literal_name().clone(), *v, 1).into_series(), + Float64(v) => Float64Chunked::full(get_literal_name().clone(), *v, 1).into_series(), #[cfg(feature = "dtype-decimal")] - Decimal(v, scale) => Int128Chunked::full(LITERAL_NAME, *v, 1) + Decimal(v, scale) => Int128Chunked::full(get_literal_name().clone(), *v, 1) .into_decimal_unchecked(None, *scale) .into_series(), - Boolean(v) => BooleanChunked::full(LITERAL_NAME, *v, 1).into_series(), - Null => polars_core::prelude::Series::new_null(LITERAL_NAME, 1), + Boolean(v) => BooleanChunked::full(get_literal_name().clone(), *v, 1).into_series(), + Null => polars_core::prelude::Series::new_null(get_literal_name().clone(), 1), Range { low, high, @@ -78,27 +78,29 @@ impl PhysicalExpr for LiteralExpr { InvalidOperation: "datatype `{}` is not supported as range", dt ), }, - String(v) => StringChunked::full(LITERAL_NAME, v, 1).into_series(), - Binary(v) => BinaryChunked::full(LITERAL_NAME, v, 1).into_series(), + String(v) => 
StringChunked::full(get_literal_name().clone(), v, 1).into_series(), + Binary(v) => BinaryChunked::full(get_literal_name().clone(), v, 1).into_series(), #[cfg(feature = "dtype-datetime")] - DateTime(timestamp, tu, tz) => Int64Chunked::full(LITERAL_NAME, *timestamp, 1) - .into_datetime(*tu, tz.clone()) - .into_series(), + DateTime(timestamp, tu, tz) => { + Int64Chunked::full(get_literal_name().clone(), *timestamp, 1) + .into_datetime(*tu, tz.clone()) + .into_series() + }, #[cfg(feature = "dtype-duration")] - Duration(v, tu) => Int64Chunked::full(LITERAL_NAME, *v, 1) + Duration(v, tu) => Int64Chunked::full(get_literal_name().clone(), *v, 1) .into_duration(*tu) .into_series(), #[cfg(feature = "dtype-date")] - Date(v) => Int32Chunked::full(LITERAL_NAME, *v, 1) + Date(v) => Int32Chunked::full(get_literal_name().clone(), *v, 1) .into_date() .into_series(), #[cfg(feature = "dtype-time")] - Time(v) => Int64Chunked::full(LITERAL_NAME, *v, 1) + Time(v) => Int64Chunked::full(get_literal_name().clone(), *v, 1) .into_time() .into_series(), Series(series) => series.deref().clone(), lv @ (Int(_) | Float(_) | StrCat(_)) => polars_core::prelude::Series::from_any_values( - LITERAL_NAME, + get_literal_name().clone(), &[lv.to_any_value().unwrap()], false, ) @@ -124,7 +126,7 @@ impl PhysicalExpr for LiteralExpr { fn to_field(&self, _input_schema: &Schema) -> PolarsResult { let dtype = self.0.get_datatype(); - Ok(Field::new("literal", dtype)) + Ok(Field::new(PlSmallStr::from_static("literal"), dtype)) } fn is_literal(&self) -> bool { true diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index 266d577b22ee..b66920de7ab9 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -615,7 +615,7 @@ impl PhysicalIoExpr for PhysicalIoHelper { self.expr.evaluate(df, &state) } - fn live_variables(&self) -> Option>> { + fn live_variables(&self) -> Option> { 
Some(expr_to_leaf_column_names(self.expr.as_expression()?)) } diff --git a/crates/polars-expr/src/expressions/rolling.rs b/crates/polars-expr/src/expressions/rolling.rs index 614673091f07..601901460c3f 100644 --- a/crates/polars-expr/src/expressions/rolling.rs +++ b/crates/polars-expr/src/expressions/rolling.rs @@ -13,7 +13,7 @@ pub(crate) struct RollingExpr { /// A function Expr. i.e. Mean, Median, Max, etc. pub(crate) function: Expr, pub(crate) phys_function: Arc, - pub(crate) out_name: Option>, + pub(crate) out_name: Option, pub(crate) options: RollingGroupOptions, pub(crate) expr: Expr, } @@ -45,7 +45,7 @@ impl PhysicalExpr for RollingExpr { .finalize(); polars_ensure!(out.len() == groups.len(), agg_len = out.len(), groups.len()); if let Some(name) = &self.out_name { - out.rename(name.as_ref()); + out.rename(name.clone()); } Ok(out) } diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index cc3447e1539c..b91910d1b936 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -131,9 +131,9 @@ fn sort_by_groups_no_match_single<'a>( }, _ => Ok(None), }) - .collect_ca_with_dtype("", dtype) + .collect_ca_with_dtype(PlSmallStr::const_default(), dtype) }); - let s = ca?.with_name(s_in.name()).into_series(); + let s = ca?.with_name(s_in.name().clone()).into_series(); ac_in.with_series(s, true, Some(expr))?; Ok(ac_in) } diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index e3c2f9e833a2..8e689eec2ec0 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -53,7 +53,7 @@ fn finish_as_iters<'a>( .transpose() }) .collect::>()? - .with_name(ac_truthy.series().name()); + .with_name(ac_truthy.series().name().clone()); // Aggregation leaves only a single chunk. 
let arr = ca.downcast_iter().next().unwrap(); @@ -285,7 +285,7 @@ impl PhysicalExpr for TernaryExpr { // SAFETY: offsets are correct. let out = LargeListArray::new(data_type, offsets, values.clone(), None); - let mut out = ListChunked::with_chunk(truthy.name(), out); + let mut out = ListChunked::with_chunk(truthy.name().clone(), out); unsafe { out.to_logical(inner_type.clone()) }; if ac_target.series().list().unwrap()._can_fast_explode() { diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index c2ccf7028b03..eb30a073f664 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -10,7 +10,6 @@ use polars_ops::frame::join::{default_join_ids, private_left_join_multiple_keys, use polars_ops::frame::SeriesJoin; use polars_ops::prelude::*; use polars_plan::prelude::*; -use polars_utils::format_smartstring; use polars_utils::sort::perfect_sort; use polars_utils::sync::SyncPtr; use rayon::prelude::*; @@ -22,8 +21,8 @@ pub struct WindowExpr { /// This will be used to create a smaller DataFrame to prevent taking unneeded columns by index pub(crate) group_by: Vec>, pub(crate) order_by: Option<(Arc, SortOptions)>, - pub(crate) apply_columns: Vec>, - pub(crate) out_name: Option>, + pub(crate) apply_columns: Vec, + pub(crate) out_name: Option, /// A function Expr. i.e. Mean, Median, Max, etc. pub(crate) function: Expr, pub(crate) phys_function: Arc, @@ -114,7 +113,7 @@ impl WindowExpr { // SAFETY: // we only have unique indices ranging from 0..len unsafe { perfect_sort(&POOL, &idx_mapping, &mut take_idx) }; - let idx = IdxCa::from_vec("", take_idx); + let idx = IdxCa::from_vec(PlSmallStr::const_default(), take_idx); // SAFETY: // groups should always be in bounds. 
@@ -175,7 +174,7 @@ impl WindowExpr { let first = group.first(); let group = group_by_columns .iter() - .map(|s| format_smartstring!("{}", s.get(first as usize).unwrap())) + .map(|s| format!("{}", s.get(first as usize).unwrap())) .collect::>(); polars_bail!( expr = self.expr, ComputeError: @@ -407,7 +406,11 @@ impl PhysicalExpr for WindowExpr { if df.is_empty() { let field = self.phys_function.to_field(&df.schema())?; - return Ok(Series::full_null(field.name(), 0, field.data_type())); + return Ok(Series::full_null( + field.name().clone(), + 0, + field.data_type(), + )); } let group_by_columns = self @@ -497,11 +500,7 @@ impl PhysicalExpr for WindowExpr { }; // 2. create GroupBy object and apply aggregation - let apply_columns = self - .apply_columns - .iter() - .map(|s| s.as_ref().to_string()) - .collect(); + let apply_columns = self.apply_columns.clone(); // some window expressions need sorted groups // to make sure that the caches align we sort @@ -526,7 +525,7 @@ impl PhysicalExpr for WindowExpr { let mut out = ac.flat_naive().into_owned(); cache_gb(gb, state, &cache_key); if let Some(name) = &self.out_name { - out.rename(name.as_ref()); + out.rename(name.clone()); } Ok(out) }, @@ -534,7 +533,7 @@ impl PhysicalExpr for WindowExpr { let mut out = ac.aggregated().explode()?; cache_gb(gb, state, &cache_key); if let Some(name) = &self.out_name { - out.rename(name.as_ref()); + out.rename(name.clone()); } Ok(out) }, @@ -616,7 +615,7 @@ impl PhysicalExpr for WindowExpr { let mut out = materialize_column(&join_opt_ids, &out_column); if let Some(name) = &self.out_name { - out.rename(name.as_ref()); + out.rename(name.clone()); } if state.cache_window() { @@ -747,7 +746,7 @@ where // SAFETY: we have written all slots unsafe { values.set_len(len) } - ChunkedArray::new_vec(ca.name(), values).into_series() + ChunkedArray::new_vec(ca.name().clone(), values).into_series() } else { // We don't use a mutable bitmap as bits will have have race conditions! 
// A single byte might alias if we write from single threads. @@ -825,6 +824,6 @@ where values.into(), Some(validity), ); - Series::try_from((ca.name(), arr.boxed())).unwrap() + Series::try_from((ca.name().clone(), arr.boxed())).unwrap() } } diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index de713e22611b..e09b274afa9f 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -251,9 +251,9 @@ fn create_physical_expr_inner( if apply_columns.is_empty() { if has_aexpr(function, expr_arena, |e| matches!(e, AExpr::Literal(_))) { - apply_columns.push(Arc::from("literal")) + apply_columns.push(PlSmallStr::from_static("literal")) } else if has_aexpr(function, expr_arena, |e| matches!(e, AExpr::Len)) { - apply_columns.push(Arc::from("len")) + apply_columns.push(PlSmallStr::from_static("len")) } else { let e = node_to_expr(function, expr_arena); polars_bail!( diff --git a/crates/polars-expr/src/reduce/convert.rs b/crates/polars-expr/src/reduce/convert.rs index f5a33aca1a0b..cfbecdfb2014 100644 --- a/crates/polars-expr/src/reduce/convert.rs +++ b/crates/polars-expr/src/reduce/convert.rs @@ -5,26 +5,15 @@ use polars_utils::arena::{Arena, Node}; use super::extrema::*; use super::sum::SumReduce; use super::*; +use crate::reduce::len::LenReduce; use crate::reduce::mean::MeanReduce; -pub fn can_convert_into_reduction(node: Node, expr_arena: &Arena) -> bool { - match expr_arena.get(node) { - AExpr::Agg(agg) => matches!( - agg, - IRAggExpr::Min { .. } - | IRAggExpr::Max { .. } - | IRAggExpr::Mean { .. } - | IRAggExpr::Sum(_) - ), - _ => false, - } -} - +/// Converts a node into a reduction + its associated selector expression. 
pub fn into_reduction( node: Node, - expr_arena: &Arena, + expr_arena: &mut Arena, schema: &Schema, -) -> PolarsResult, Node)>> { +) -> PolarsResult<(Box, Node)> { let e = expr_arena.get(node); let field = e.to_field(schema, Context::Default, expr_arena)?; let out = match expr_arena.get(node) { @@ -74,9 +63,20 @@ pub fn into_reduction( let out: Box = Box::new(MeanReduce::new(field.dtype.clone())); (out, *input) }, - _ => return Ok(None), + _ => unreachable!(), + }, + AExpr::Len => { + // Compute length on the first column, or if none exist we'll never + // be called and correctly return 0 as length anyway. + let out: Box = Box::new(LenReduce::new()); + let expr = if let Some(first_column) = schema.iter_names().next() { + expr_arena.add(AExpr::Column(first_column.as_str().into())) + } else { + expr_arena.add(AExpr::Literal(LiteralValue::Null)) + }; + (out, expr) }, - _ => return Ok(None), + _ => unreachable!(), }; - Ok(Some(out)) + Ok(out) } diff --git a/crates/polars-expr/src/reduce/len.rs b/crates/polars-expr/src/reduce/len.rs new file mode 100644 index 000000000000..bf9391e8fd33 --- /dev/null +++ b/crates/polars-expr/src/reduce/len.rs @@ -0,0 +1,45 @@ +use polars_core::error::constants::LENGTH_LIMIT_MSG; + +use super::*; + +#[derive(Clone)] +pub struct LenReduce { + len: u64, +} + +impl LenReduce { + pub(crate) fn new() -> Self { + Self { len: 0 } + } +} + +impl Reduction for LenReduce { + fn init_dyn(&self) -> Box { + Box::new(Self::new()) + } + + fn reset(&mut self) { + self.len = 0; + } + + fn update(&mut self, batch: &Series) -> PolarsResult<()> { + self.len += batch.len() as u64; + Ok(()) + } + + fn combine(&mut self, other: &dyn Reduction) -> PolarsResult<()> { + let other = other.as_any().downcast_ref::().unwrap(); + self.len += other.len; + Ok(()) + } + + fn finalize(&mut self) -> PolarsResult { + #[allow(clippy::useless_conversion)] + let as_idx: IdxSize = self.len.try_into().expect(LENGTH_LIMIT_MSG); + Ok(Scalar::new(IDX_DTYPE, as_idx.into())) + } + + 
fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/crates/polars-expr/src/reduce/mean.rs b/crates/polars-expr/src/reduce/mean.rs index 0d06974d956b..4c5fe635a442 100644 --- a/crates/polars-expr/src/reduce/mean.rs +++ b/crates/polars-expr/src/reduce/mean.rs @@ -1,5 +1,3 @@ -use polars_core::utils::Container; - use super::*; #[derive(Clone)] diff --git a/crates/polars-expr/src/reduce/mod.rs b/crates/polars-expr/src/reduce/mod.rs index bb51ba5c8a8d..9c7bb4f6f6bc 100644 --- a/crates/polars-expr/src/reduce/mod.rs +++ b/crates/polars-expr/src/reduce/mod.rs @@ -1,11 +1,12 @@ mod convert; mod extrema; +mod len; mod mean; mod sum; use std::any::Any; -pub use convert::{can_convert_into_reduction, into_reduction}; +pub use convert::into_reduction; use polars_core::prelude::*; #[allow(dead_code)] diff --git a/crates/polars-expr/src/state/node_timer.rs b/crates/polars-expr/src/state/node_timer.rs index 95084eeb4fcb..8102aa8fcf83 100644 --- a/crates/polars-expr/src/state/node_timer.rs +++ b/crates/polars-expr/src/state/node_timer.rs @@ -42,20 +42,20 @@ impl NodeTimer { polars_ensure!(!ticks.is_empty(), ComputeError: "no data to time"); let start = ticks[0].0; ticks.push((self.query_start, start)); - let nodes_s = Series::new("node", nodes); + let nodes_s = Series::new(PlSmallStr::from_static("node"), nodes); let start: NoNull = ticks .iter() .map(|(start, _)| (start.duration_since(self.query_start)).as_micros() as u64) .collect(); let mut start = start.into_inner(); - start.rename("start"); + start.rename(PlSmallStr::from_static("start")); let end: NoNull = ticks .iter() .map(|(_, end)| (end.duration_since(self.query_start)).as_micros() as u64) .collect(); let mut end = end.into_inner(); - end.rename("end"); + end.rename(PlSmallStr::from_static("end")); let columns = vec![nodes_s, start.into_series(), end.into_series()]; let df = unsafe { DataFrame::new_no_checks(columns) }; diff --git a/crates/polars-ffi/src/version_0.rs b/crates/polars-ffi/src/version_0.rs index 
eb24542f0733..0fc29055f66d 100644 --- a/crates/polars-ffi/src/version_0.rs +++ b/crates/polars-ffi/src/version_0.rs @@ -54,7 +54,11 @@ unsafe extern "C" fn c_release_series_export(e: *mut SeriesExport) { } pub fn export_series(s: &Series) -> SeriesExport { - let field = ArrowField::new(s.name(), s.dtype().to_arrow(CompatLevel::newest()), true); + let field = ArrowField::new( + s.name().clone(), + s.dtype().to_arrow(CompatLevel::newest()), + true, + ); let schema = Box::new(ffi::export_field_to_c(&field)); let mut arrays = (0..s.chunks().len()) @@ -91,7 +95,7 @@ pub unsafe fn import_series(e: SeriesExport) -> PolarsResult { }) .collect::>>()?; - Series::try_from((field.name.as_str(), chunks)) + Series::try_from((field.name.clone(), chunks)) } /// # Safety @@ -144,7 +148,7 @@ mod test { #[test] fn test_ffi() { - let s = Series::new("a", [1, 2]); + let s = Series::new("a".into(), [1, 2]); let e = export_series(&s); unsafe { diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 4a5fd97bf689..9eb2addc8be5 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -44,7 +44,6 @@ serde = { workspace = true, features = ["rc"], optional = true } serde_json = { version = "1", optional = true } simd-json = { workspace = true, optional = true } simdutf8 = { workspace = true, optional = true } -smartstring = { workspace = true } tokio = { workspace = true, features = ["fs", "net", "rt-multi-thread", "time", "sync"], optional = true } tokio-util = { workspace = true, features = ["io", "io-util"], optional = true } url = { workspace = true, optional = true } @@ -67,7 +66,7 @@ json = [ "dtype-struct", "csv", ] -serde = ["dep:serde", "polars-core/serde-lazy", "polars-parquet/serde"] +serde = ["dep:serde", "polars-core/serde-lazy", "polars-parquet/serde", "polars-utils/serde"] # support for arrows ipc file parsing ipc = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrows streaming ipc file parsing diff --git 
a/crates/polars-io/src/cloud/options.rs b/crates/polars-io/src/cloud/options.rs index ca9016d05a96..efaab673f634 100644 --- a/crates/polars-io/src/cloud/options.rs +++ b/crates/polars-io/src/cloud/options.rs @@ -26,13 +26,13 @@ use polars_error::*; #[cfg(feature = "aws")] use polars_utils::cache::FastFixedCache; #[cfg(feature = "aws")] +use polars_utils::pl_str::PlSmallStr; +#[cfg(feature = "aws")] use regex::Regex; #[cfg(feature = "http")] use reqwest::header::HeaderMap; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -#[cfg(feature = "aws")] -use smartstring::alias::String as SmartString; #[cfg(feature = "cloud")] use url::Url; @@ -42,7 +42,7 @@ use crate::file_cache::get_env_file_cache_ttl; use crate::pl_async::with_concurrency_budget; #[cfg(feature = "aws")] -static BUCKET_REGION: Lazy>> = +static BUCKET_REGION: Lazy>> = Lazy::new(|| std::sync::Mutex::new(FastFixedCache::new(32))); /// The type of the config keys must satisfy the following requirements: diff --git a/crates/polars-io/src/csv/read/buffer.rs b/crates/polars-io/src/csv/read/buffer.rs index 26e9359a6000..712201ceaca6 100644 --- a/crates/polars-io/src/csv/read/buffer.rs +++ b/crates/polars-io/src/csv/read/buffer.rs @@ -147,7 +147,7 @@ where } pub struct Utf8Field { - name: String, + name: PlSmallStr, mutable: MutableBinaryViewArray, scratch: Vec, quote_char: u8, @@ -155,9 +155,14 @@ pub struct Utf8Field { } impl Utf8Field { - fn new(name: &str, capacity: usize, quote_char: Option, encoding: CsvEncoding) -> Self { + fn new( + name: PlSmallStr, + capacity: usize, + quote_char: Option, + encoding: CsvEncoding, + ) -> Self { Self { - name: name.to_string(), + name, mutable: MutableBinaryViewArray::with_capacity(capacity), scratch: vec![], quote_char: quote_char.unwrap_or(b'"'), @@ -254,7 +259,7 @@ pub struct CategoricalField { #[cfg(feature = "dtype-categorical")] impl CategoricalField { fn new( - name: &str, + name: PlSmallStr, capacity: usize, quote_char: Option, ordering: 
CategoricalOrdering, @@ -358,7 +363,7 @@ pub struct DatetimeField { #[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))] impl DatetimeField { - fn new(name: &str, capacity: usize) -> Self { + fn new(name: PlSmallStr, capacity: usize) -> Self { let builder = PrimitiveChunkedBuilder::::new(name, capacity); Self { compiled: None, @@ -492,6 +497,7 @@ pub fn init_buffers( .iter() .map(|&i| { let (name, dtype) = schema.get_at_index(i).unwrap(); + let name = name.clone(); let builder = match dtype { &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)), #[cfg(feature = "dtype-i8")] @@ -625,7 +631,7 @@ impl Buffer { Buffer::Utf8(v) => { let arr = v.mutable.freeze(); - StringChunked::with_chunk(v.name.as_str(), arr).into_series() + StringChunked::with_chunk(v.name.clone(), arr).into_series() }, #[allow(unused_variables)] Buffer::Categorical(buf) => { diff --git a/crates/polars-io/src/csv/read/options.rs b/crates/polars-io/src/csv/read/options.rs index 2d10029975e2..7659565918ef 100644 --- a/crates/polars-io/src/csv/read/options.rs +++ b/crates/polars-io/src/csv/read/options.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use polars_core::datatypes::{DataType, Field}; use polars_core::schema::{IndexOfSchema, Schema, SchemaRef}; use polars_error::PolarsResult; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -21,7 +22,7 @@ pub struct CsvReadOptions { pub n_rows: Option, pub row_index: Option, // Column-wise options - pub columns: Option>, + pub columns: Option>, pub projection: Option>>, pub schema: Option, pub schema_overwrite: Option, @@ -146,7 +147,7 @@ impl CsvReadOptions { } /// Which columns to select. - pub fn with_columns(mut self, columns: Option>) -> Self { + pub fn with_columns(mut self, columns: Option>) -> Self { self.columns = columns; self } @@ -336,7 +337,7 @@ pub enum CommentPrefix { Single(u8), /// A string that indicates the start of a comment line. 
/// This allows for multiple characters to be used as a comment identifier. - Multi(Arc), + Multi(PlSmallStr), } impl CommentPrefix { @@ -346,8 +347,8 @@ impl CommentPrefix { } /// Creates a new `CommentPrefix` for the `Multi` variant. - pub fn new_multi(prefix: String) -> Self { - CommentPrefix::Multi(Arc::from(prefix.as_str())) + pub fn new_multi(prefix: PlSmallStr) -> Self { + CommentPrefix::Multi(prefix) } /// Creates a new `CommentPrefix` from a `&str`. @@ -356,7 +357,7 @@ impl CommentPrefix { let c = prefix.as_bytes()[0]; CommentPrefix::Single(c) } else { - CommentPrefix::Multi(Arc::from(prefix)) + CommentPrefix::Multi(PlSmallStr::from_str(prefix)) } } } @@ -371,11 +372,11 @@ impl From<&str> for CommentPrefix { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum NullValues { /// A single value that's used for all columns - AllColumnsSingle(String), + AllColumnsSingle(PlSmallStr), /// Multiple values that are used for all columns - AllColumns(Vec), + AllColumns(Vec), /// Tuples that map column names to null value of that column - Named(Vec<(String, String)>), + Named(Vec<(PlSmallStr, PlSmallStr)>), } impl NullValues { @@ -384,7 +385,7 @@ impl NullValues { NullValues::AllColumnsSingle(v) => NullValuesCompiled::AllColumnsSingle(v), NullValues::AllColumns(v) => NullValuesCompiled::AllColumns(v), NullValues::Named(v) => { - let mut null_values = vec!["".to_string(); schema.len()]; + let mut null_values = vec![PlSmallStr::from_static(""); schema.len()]; for (name, null_value) in v { let i = schema.try_index_of(&name)?; null_values[i] = null_value; @@ -398,11 +399,11 @@ impl NullValues { #[derive(Debug, Clone)] pub(super) enum NullValuesCompiled { /// A single value that's used for all columns - AllColumnsSingle(String), + AllColumnsSingle(PlSmallStr), // Multiple null values that are null for all columns - AllColumns(Vec), + AllColumns(Vec), /// A different null value per column, computed from `NullValues::Named` - Columns(Vec), + Columns(Vec), 
} impl NullValuesCompiled { diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 5fb9ce29a1a1..007bc215171d 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -74,7 +74,7 @@ pub(crate) fn cast_columns( df.get_columns() .into_par_iter() .map(|s| { - if let Some(fld) = to_cast.iter().find(|fld| fld.name().as_str() == s.name()) { + if let Some(fld) = to_cast.iter().find(|fld| fld.name() == s.name()) { cast_fn(s, fld) } else { Ok(s.clone()) @@ -150,7 +150,7 @@ impl<'a> CoreReader<'a> { has_header: bool, ignore_errors: bool, schema: Option, - columns: Option>, + columns: Option>, encoding: CsvEncoding, mut n_threads: Option, schema_overwrite: Option, @@ -496,7 +496,7 @@ impl<'a> CoreReader<'a> { ) }; if let Some(ref row_index) = self.row_index { - df.insert_column(0, Series::new_empty(&row_index.name, &IDX_DTYPE))?; + df.insert_column(0, Series::new_empty(row_index.name.clone(), &IDX_DTYPE))?; } return Ok(df); } @@ -559,7 +559,7 @@ impl<'a> CoreReader<'a> { let mut local_df = unsafe { DataFrame::new_no_checks(columns) }; let current_row_count = local_df.height() as IdxSize; if let Some(rc) = &self.row_index { - local_df.with_row_index_mut(&rc.name, Some(rc.offset)); + local_df.with_row_index_mut(rc.name.clone(), Some(rc.offset)); }; cast_columns(&mut local_df, &self.to_cast, false, self.ignore_errors)?; @@ -617,7 +617,7 @@ impl<'a> CoreReader<'a> { cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_index { - df.with_row_index_mut(&rc.name, Some(rc.offset)); + df.with_row_index_mut(rc.name.clone(), Some(rc.offset)); } let n_read = df.height() as IdxSize; Ok((df, n_read)) @@ -666,7 +666,7 @@ impl<'a> CoreReader<'a> { cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_index { - df.with_row_index_mut(&rc.name, Some(rc.offset)); + df.with_row_index_mut(rc.name.clone(), 
Some(rc.offset)); } let n_read = df.height() as IdxSize; (df, n_read) diff --git a/crates/polars-io/src/csv/read/read_impl/batched.rs b/crates/polars-io/src/csv/read/read_impl/batched.rs index c4be765648cb..3bf6e2dd4e32 100644 --- a/crates/polars-io/src/csv/read/read_impl/batched.rs +++ b/crates/polars-io/src/csv/read/read_impl/batched.rs @@ -258,7 +258,7 @@ impl<'a> BatchedCsvReader<'a> { cast_columns(&mut df, &self.to_cast, false, self.ignore_errors)?; if let Some(rc) = &self.row_index { - df.with_row_index_mut(&rc.name, Some(rc.offset)); + df.with_row_index_mut(rc.name.clone(), Some(rc.offset)); } Ok(df) }) diff --git a/crates/polars-io/src/csv/read/reader.rs b/crates/polars-io/src/csv/read/reader.rs index c45e18f3c098..f1a155d84fd2 100644 --- a/crates/polars-io/src/csv/read/reader.rs +++ b/crates/polars-io/src/csv/read/reader.rs @@ -304,7 +304,7 @@ where let schema = dtypes .iter() .zip(df.get_column_names()) - .map(|(dtype, name)| Field::new(name, dtype.clone())) + .map(|(dtype, name)| Field::new(name.clone(), dtype.clone())) .collect::(); Arc::new(schema) diff --git a/crates/polars-io/src/csv/read/schema_inference.rs b/crates/polars-io/src/csv/read/schema_inference.rs index bdbd8296f7fe..c8870ee65b30 100644 --- a/crates/polars-io/src/csv/read/schema_inference.rs +++ b/crates/polars-io/src/csv/read/schema_inference.rs @@ -6,6 +6,7 @@ use polars_core::prelude::*; use polars_time::chunkedarray::string::infer as date_infer; #[cfg(feature = "polars-time")] use polars_time::prelude::string::Pattern; +use polars_utils::format_pl_smallstr; use polars_utils::slice::GetSaferUnchecked; use super::options::{CommentPrefix, CsvEncoding, NullValues}; @@ -129,9 +130,10 @@ pub fn infer_field_schema(string: &str, try_parse_dates: bool, decimal_comma: bo DataType::Datetime(TimeUnit::Microseconds, None) }, Pattern::DateYMD | Pattern::DateDMY => DataType::Date, - Pattern::DatetimeYMDZ => { - DataType::Datetime(TimeUnit::Microseconds, Some("UTC".to_string())) - }, + 
Pattern::DatetimeYMDZ => DataType::Datetime( + TimeUnit::Microseconds, + Some(PlSmallStr::from_static("UTC")), + ), }, None => DataType::String, } @@ -162,9 +164,10 @@ pub fn infer_field_schema(string: &str, try_parse_dates: bool, decimal_comma: bo DataType::Datetime(TimeUnit::Microseconds, None) }, Pattern::DateYMD | Pattern::DateDMY => DataType::Date, - Pattern::DatetimeYMDZ => { - DataType::Datetime(TimeUnit::Microseconds, Some("UTC".to_string())) - }, + Pattern::DatetimeYMDZ => DataType::Datetime( + TimeUnit::Microseconds, + Some(PlSmallStr::from_static("UTC")), + ), }, None => DataType::String, } @@ -241,7 +244,7 @@ fn infer_file_schema_inner( } // now that we've found the first non-comment line we parse the headers, or we create a header - let headers: Vec = if let Some(mut header_line) = first_line { + let headers: Vec = if let Some(mut header_line) = first_line { let len = header_line.len(); if len > 1 { // remove carriage return @@ -272,9 +275,9 @@ fn infer_file_schema_inner( for name in &headers { let count = header_names.entry(name.as_ref()).or_insert(0usize); if *count != 0 { - final_headers.push(format!("{}_duplicated_{}", name, *count - 1)) + final_headers.push(format_pl_smallstr!("{}_duplicated_{}", name, *count - 1)) } else { - final_headers.push(name.to_string()) + final_headers.push(PlSmallStr::from_str(name)) } *count += 1; } @@ -282,8 +285,8 @@ fn infer_file_schema_inner( } else { byterecord .enumerate() - .map(|(i, _s)| format!("column_{}", i + 1)) - .collect::>() + .map(|(i, _s)| format_pl_smallstr!("column_{}", i + 1)) + .collect::>() } } else if has_header && !bytes.is_empty() && recursion_count == 0 { // there was no new line char. 
So we copy the whole buf and add one @@ -395,7 +398,7 @@ fn infer_file_schema_inner( } }, Some(NullValues::AllColumnsSingle(name)) => { - if s.as_ref() != name { + if s.as_ref() != name.as_str() { Some(infer_field_schema(&s, try_parse_dates, decimal_comma)) } else { None @@ -405,10 +408,10 @@ fn infer_file_schema_inner( // SAFETY: // we iterate over headers length. let current_name = unsafe { headers.get_unchecked_release(i) }; - let null_name = &names.iter().find(|name| &name.0 == current_name); + let null_name = &names.iter().find(|name| name.0 == current_name); if let Some(null_name) = null_name { - if null_name.1 != s.as_ref() { + if null_name.1.as_str() != s.as_ref() { Some(infer_field_schema(&s, try_parse_dates, decimal_comma)) } else { None @@ -448,7 +451,7 @@ fn infer_file_schema_inner( if let Some(schema_overwrite) = schema_overwrite { if let Some((_, name, dtype)) = schema_overwrite.get_full(field_name) { - fields.push(Field::new(name, dtype.clone())); + fields.push(Field::new(name.clone(), dtype.clone())); continue; } @@ -456,7 +459,7 @@ fn infer_file_schema_inner( // execute only if schema is complete if schema_overwrite.len() == header_length { if let Some((name, dtype)) = schema_overwrite.get_at_index(i) { - fields.push(Field::new(name, dtype.clone())); + fields.push(Field::new(name.clone(), dtype.clone())); continue; } } @@ -464,7 +467,7 @@ fn infer_file_schema_inner( let possibilities = &column_types[i]; let dtype = finish_infer_field_schema(possibilities); - fields.push(Field::new(field_name, dtype)); + fields.push(Field::new(field_name.clone(), dtype)); } // if there is a single line after the header without an eol // we copy the bytes add an eol and rerun this function diff --git a/crates/polars-io/src/csv/write/writer.rs b/crates/polars-io/src/csv/write/writer.rs index 9369dacbe6da..32c657b6e1a6 100644 --- a/crates/polars-io/src/csv/write/writer.rs +++ b/crates/polars-io/src/csv/write/writer.rs @@ -49,9 +49,13 @@ where if self.bom { 
write_bom(&mut self.buffer)?; } - let names = df.get_column_names(); + let names = df + .get_column_names() + .into_iter() + .map(|x| x.as_str()) + .collect::>(); if self.header { - write_header(&mut self.buffer, &names, &self.options)?; + write_header(&mut self.buffer, names.as_slice(), &self.options)?; } write( &mut self.buffer, @@ -193,8 +197,16 @@ impl BatchedWriter { if !self.has_written_header { self.has_written_header = true; - let names = df.get_column_names(); - write_header(&mut self.writer.buffer, &names, &self.writer.options)?; + let names = df + .get_column_names() + .into_iter() + .map(|x| x.as_str()) + .collect::>(); + write_header( + &mut self.writer.buffer, + names.as_slice(), + &self.writer.options, + )?; } write( @@ -216,7 +228,7 @@ impl BatchedWriter { if !self.has_written_header { self.has_written_header = true; - let names = self.schema.get_names(); + let names = self.schema.get_names_str(); write_header(&mut self.writer.buffer, &names, &self.writer.options)?; }; diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index e3c557eac1f3..aa6546c8dd5a 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -12,8 +12,8 @@ //! use std::io::Cursor; //! //! -//! let s0 = Series::new("days", &[0, 1, 2, 3, 4]); -//! let s1 = Series::new("temp", &[22.1, 19.9, 7., 2., 3.]); +//! let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); +//! let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); //! let mut df = DataFrame::new(vec![s0, s1]).unwrap(); //! //! // Create an in memory file handler. @@ -81,7 +81,7 @@ pub struct IpcReader { pub(super) projection: Option>, pub(crate) columns: Option>, hive_partition_columns: Option>, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, pub(super) row_index: Option, // Stores the as key semaphore to make sure we don't write to the memory mapped file. 
pub(super) memory_map: Option, @@ -136,7 +136,7 @@ impl IpcReader { pub fn with_include_file_path( mut self, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, ) -> Self { self.include_file_path = include_file_path; self @@ -300,7 +300,7 @@ impl SerReader for IpcReader { if let Some((col, value)) = include_file_path { unsafe { - df.with_column_unchecked(StringChunked::full(&col, &value, row_count).into_series()) + df.with_column_unchecked(StringChunked::full(col, &value, row_count).into_series()) }; } diff --git a/crates/polars-io/src/ipc/ipc_reader_async.rs b/crates/polars-io/src/ipc/ipc_reader_async.rs index 9d392575e956..88464cebe056 100644 --- a/crates/polars-io/src/ipc/ipc_reader_async.rs +++ b/crates/polars-io/src/ipc/ipc_reader_async.rs @@ -7,6 +7,7 @@ use polars_core::datatypes::IDX_DTYPE; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::{polars_bail, polars_err, to_compute_err, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use crate::cloud::{ build_object_store, object_path_from_str, CloudLocation, CloudOptions, PolarsObjectStore, @@ -27,7 +28,7 @@ pub struct IpcReaderAsync { #[derive(Default, Clone)] pub struct IpcReadOptions { // Names of the columns to include in the output. - projection: Option>, + projection: Option>, // The maximum number of rows to include in the output. row_limit: Option, @@ -40,7 +41,7 @@ pub struct IpcReadOptions { } impl IpcReadOptions { - pub fn with_projection(mut self, projection: Option>) -> Self { + pub fn with_projection(mut self, projection: Option>) -> Self { self.projection = projection; self } diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index c8429e1b2d80..06fbc581ea30 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -13,8 +13,8 @@ //! use std::io::Cursor; //! //! -//! let s0 = Series::new("days", &[0, 1, 2, 3, 4]); -//! 
let s1 = Series::new("temp", &[22.1, 19.9, 7., 2., 3.]); +//! let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); +//! let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); //! let mut df = DataFrame::new(vec![s0, s1]).unwrap(); //! //! // Create an in memory file handler. diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs index f4158abe69e7..01763edeaa59 100644 --- a/crates/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -219,7 +219,7 @@ where ignore_errors: bool, infer_schema_len: Option, batch_size: NonZeroUsize, - projection: Option>, + projection: Option>, schema: Option, schema_overwrite: Option<&'a Schema>, json_format: JsonFormat, @@ -307,7 +307,7 @@ where DataType::Struct( schema .into_iter() - .map(|(name, dt)| Field::new(&name, dt)) + .map(|(name, dt)| Field::new(name, dt)) .collect(), ) .to_arrow(CompatLevel::newest()) @@ -318,7 +318,9 @@ where let dtype = if let BorrowedValue::Array(_) = &json_value { ArrowDataType::LargeList(Box::new(arrow::datatypes::Field::new( - "item", dtype, true, + PlSmallStr::from_static("item"), + dtype, + true, ))) } else { dtype @@ -355,8 +357,8 @@ where }?; // TODO! Ensure we don't materialize the columns we don't need - if let Some(proj) = &self.projection { - out.select(proj) + if let Some(proj) = self.projection.as_deref() { + out.select(proj.iter().cloned()) } else { Ok(out) } @@ -405,7 +407,7 @@ where /// /// Setting `projection` to the columns you want to keep is more efficient than deserializing all of the columns and /// then dropping the ones you don't want. 
- pub fn with_projection(mut self, projection: Option>) -> Self { + pub fn with_projection(mut self, projection: Option>) -> Self { self.projection = projection; self } diff --git a/crates/polars-io/src/ndjson/buffer.rs b/crates/polars-io/src/ndjson/buffer.rs index df526dc49ec4..ff34a0aaee95 100644 --- a/crates/polars-io/src/ndjson/buffer.rs +++ b/crates/polars-io/src/ndjson/buffer.rs @@ -29,7 +29,7 @@ pub(crate) struct Buffer<'a> { impl Buffer<'_> { pub fn into_series(self) -> Series { let mut s = self.buf.into_series(); - s.rename(self.name); + s.rename(PlSmallStr::from_str(self.name)); s } @@ -201,7 +201,12 @@ fn deserialize_all<'a>( .iter() .map(|val| deserialize_all(val, inner_dtype, ignore_errors)) .collect::>()?; - let s = Series::from_any_values_and_dtype("", &vals, inner_dtype, false)?; + let s = Series::from_any_values_and_dtype( + PlSmallStr::const_default(), + &vals, + inner_dtype, + false, + )?; AnyValue::List(s) }, #[cfg(feature = "dtype-struct")] diff --git a/crates/polars-io/src/ndjson/core.rs b/crates/polars-io/src/ndjson/core.rs index 706e49c80f42..c3754f9403d1 100644 --- a/crates/polars-io/src/ndjson/core.rs +++ b/crates/polars-io/src/ndjson/core.rs @@ -36,7 +36,7 @@ where ignore_errors: bool, row_index: Option<&'a mut RowIndex>, predicate: Option>, - projection: Option>, + projection: Option>, } impl<'a, R> JsonLineReader<'a, R> @@ -67,7 +67,7 @@ where self } - pub fn with_projection(mut self, projection: Option>) -> Self { + pub fn with_projection(mut self, projection: Option>) -> Self { self.projection = projection; self } @@ -202,7 +202,7 @@ pub(crate) struct CoreJsonReader<'a> { ignore_errors: bool, row_index: Option<&'a mut RowIndex>, predicate: Option>, - projection: Option>, + projection: Option>, } impl<'a> CoreJsonReader<'a> { #[allow(clippy::too_many_arguments)] @@ -219,7 +219,7 @@ impl<'a> CoreJsonReader<'a> { ignore_errors: bool, row_index: Option<&'a mut RowIndex>, predicate: Option>, - projection: Option>, + projection: Option>, ) 
-> PolarsResult> { let reader_bytes = reader_bytes; @@ -314,13 +314,13 @@ impl<'a> CoreJsonReader<'a> { )?; let prepredicate_height = local_df.height() as IdxSize; - if let Some(projection) = &self.projection { - local_df = local_df.select(projection.as_ref())?; + if let Some(projection) = self.projection.as_deref() { + local_df = local_df.select(projection.iter().cloned())?; } if let Some(row_index) = row_index { local_df = local_df - .with_row_index(row_index.name.as_ref(), Some(row_index.offset))?; + .with_row_index(row_index.name.clone(), Some(row_index.offset))?; } if let Some(predicate) = &self.predicate { diff --git a/crates/polars-io/src/options.rs b/crates/polars-io/src/options.rs index 338bb819a099..4950b747d807 100644 --- a/crates/polars-io/src/options.rs +++ b/crates/polars-io/src/options.rs @@ -1,6 +1,5 @@ -use std::sync::Arc; - use polars_core::schema::SchemaRef; +use polars_utils::pl_str::PlSmallStr; use polars_utils::IdxSize; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -8,7 +7,7 @@ use serde::{Deserialize, Serialize}; #[derive(Clone, Debug, Eq, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RowIndex { - pub name: Arc, + pub name: PlSmallStr, pub offset: IdxSize, } diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index 812011af48bf..a06b3f88a0dd 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -8,7 +8,7 @@ use polars_core::config::{get_rg_prefetch_size, verbose}; use polars_core::prelude::*; use polars_parquet::read::RowGroupMetaData; use polars_parquet::write::FileMetaData; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::sync::Mutex; @@ -165,7 +165,7 @@ pub async fn fetch_metadata( /// Download rowgroups for the column whose indexes are given in 
`projection`. /// We concurrently download the columns for each field. async fn download_projection( - fields: Arc<[SmartString]>, + fields: Arc<[PlSmallStr]>, row_group: RowGroupMetaData, async_reader: Arc, sender: QueueSend, @@ -182,7 +182,7 @@ async fn download_projection( // A single column can have multiple matches (structs). let iter = columns.iter().filter_map(|meta| { - if meta.descriptor().path_in_schema[0] == name.as_str() { + if meta.descriptor().path_in_schema[0] == name { let (offset, len) = meta.byte_range(); Some((offset, offset as usize..(offset + len) as usize)) } else { @@ -265,10 +265,10 @@ impl FetchRowGroupsFromObjectStore { row_group_range: Range, row_groups: &[RowGroupMetaData], ) -> PolarsResult { - let projected_fields: Option> = projection.map(|projection| { + let projected_fields: Option> = projection.map(|projection| { projection .iter() - .map(|i| SmartString::from(schema.fields[*i].name.as_str())) + .map(|i| (schema.fields[*i].name.clone())) .collect() }); diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs index 565ef53f4edd..8d9d35e7663e 100644 --- a/crates/polars-io/src/parquet/read/predicates.rs +++ b/crates/polars-io/src/parquet/read/predicates.rs @@ -8,9 +8,9 @@ impl ColumnStats { fn from_arrow_stats(stats: Statistics, field: &ArrowField) -> Self { Self::new( field.into(), - Some(Series::try_from(("", stats.null_count)).unwrap()), - Some(Series::try_from(("", stats.min_value)).unwrap()), - Some(Series::try_from(("", stats.max_value)).unwrap()), + Some(Series::try_from((PlSmallStr::const_default(), stats.null_count)).unwrap()), + Some(Series::try_from((PlSmallStr::const_default(), stats.min_value)).unwrap()), + Some(Series::try_from((PlSmallStr::const_default(), stats.max_value)).unwrap()), ) } } diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 1f8dfbd65295..3a14c39b99a9 100644 --- 
a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::VecDeque; use std::ops::{Deref, Range}; +use arrow::array::BooleanArray; use arrow::bitmap::{Bitmap, MutableBitmap}; use arrow::datatypes::ArrowSchemaRef; use polars_core::prelude::*; @@ -233,7 +234,7 @@ fn rg_to_dfs_prefiltered( row_group_end: usize, file_metadata: &FileMetaData, schema: &ArrowSchemaRef, - live_variables: Vec>, + live_variables: Vec, predicate: &dyn PhysicalIoExpr, row_index: Option, projection: &[usize], @@ -313,6 +314,20 @@ fn rg_to_dfs_prefiltered( debug_assert_eq!(live_idx_to_col_idx.len(), num_live_columns); debug_assert_eq!(dead_idx_to_col_idx.len(), num_dead_columns); + enum MaskSetting { + Auto, + Pre, + Post, + } + + let mask_setting = + std::env::var("POLARS_PQ_PREFILTERED_MASK").map_or(MaskSetting::Auto, |v| match &v[..] { + "auto" => MaskSetting::Auto, + "pre" => MaskSetting::Pre, + "post" => MaskSetting::Post, + _ => panic!("Invalid `POLARS_PQ_PREFILTERED_MASK` value '{v}'."), + }); + POOL.install(|| { // Set partitioned fields to prevent quadratic behavior. // Ensure all row groups are partitioned. @@ -354,7 +369,7 @@ fn rg_to_dfs_prefiltered( let mask = s.bool().expect("filter predicates was not of type boolean"); if let Some(rc) = &row_index { - df.with_row_index_mut(&rc.name, Some(rg.row_offset + rc.offset)); + df.with_row_index_mut(rc.name.clone(), Some(rg.row_offset + rc.offset)); } df = df.filter(mask)?; @@ -394,38 +409,34 @@ fn rg_to_dfs_prefiltered( return Ok(dfs.into_iter().map(|(_, df)| df).collect()); } - // @TODO: Incorporate this if we how we can properly use it. The problem here is that - // different columns really have a different cost when it comes to collecting them. We - // would need a cost model to properly estimate this. - // - // // For bitmasks that are seemingly random (i.e. 
not clustered or biased towards 0 or 1), - // // filtering with a bitmask in the Parquet reader is actually around 1.5 - 2.2 times slower - // // than collecting everything and filtering afterwards. This is because stopping and - // // starting decoding is not free. - // // - // // To combat this we try to detect here how biased our data is. We do this with a bithack - // // that estimates the amount of switches from 0 to 1 and from 1 to 0. This can be SIMD-ed - // // very well and gives us quite good estimate of how random our bitmask is. Then, we select - // // the filter if the bitmask is not that random. - // let do_filter_rg = dfs - // .par_iter() - // .map(|(mask, _)| { - // let iter = mask.fast_iter_u64(); - // - // // The iter is TrustedLen so the size_hint is exact. - // let num_items = iter.size_hint().0; - // let num_switches = iter - // .map(|v| (v ^ v.rotate_right(1)).count_ones() as u64) - // .sum::(); - // - // // We ignore the iter remainder since we only really care about the average. - // let avg_num_switches_per_element = num_switches / num_items as u64; - // - // // We select the filter if the average amount of switches per 64 elements is less - // // than or equal to 2. - // avg_num_switches_per_element <= 2 - // }) - // .collect::>(); + let rg_prefilter_costs = matches!(mask_setting, MaskSetting::Auto) + .then(|| { + dfs.par_iter() + .map(|(mask, _)| { + let num_edges = mask.num_edges() as f64; + let rg_len = mask.len() as f64; + + // @GB: I did quite some analysis on this. + // + // Pre-filtered and Post-filtered can both be faster in certain scenarios. + // + // - Pre-filtered is faster when there is some amount of clustering or + // sorting involved or if the number of values selected is small. + // - Post-filtering is faster when the predicate selects a somewhat random + // elements throughout the row group. + // + // The following is a heuristic value to try and estimate which one is + // faster. 
Essentially, it sees how many times it needs to switch between + // skipping items and collecting items and compares it against the number + // of values that it will collect. + // + // Closer to 0: post-filtering is probably better. + // Closer to 1: pre-filtering is probably better. + (num_edges / rg_len).clamp(0.0, 1.0) + }) + .collect::>() + }) + .unwrap_or_default(); let mut rg_columns = (0..dfs.len() * num_dead_columns) .into_par_iter() @@ -444,20 +455,58 @@ fn rg_to_dfs_prefiltered( } let field_md = part_mds[rg_idx as usize].get_partitions(name).unwrap(); - column_idx_to_series( - col_idx, - field_md.as_slice(), - Some(Filter::new_masked(mask.clone())), - schema, - store, - ) + let pre = || { + column_idx_to_series( + col_idx, + field_md.as_slice(), + Some(Filter::new_masked(mask.clone())), + schema, + store, + ) + }; + let post = || { + let array = + column_idx_to_series(col_idx, field_md.as_slice(), None, schema, store)?; + + debug_assert_eq!(array.len(), mask.len()); + + let mask_arr = BooleanArray::new(ArrowDataType::Boolean, mask.clone(), None); + let mask_arr = BooleanChunked::from(mask_arr); + array.filter(&mask_arr) + }; + + let array = match mask_setting { + MaskSetting::Auto => { + // Prefiltering is more expensive for nested types so we make the cut-off + // higher. + let is_nested = schema.fields[col_idx].data_type.is_nested(); + let prefilter_cost = rg_prefilter_costs[i / num_dead_columns]; + + // We empirically selected these numbers. + let do_prefilter = (is_nested && prefilter_cost <= 0.01) + || (!is_nested && prefilter_cost <= 0.02); + + if do_prefilter { + pre()? + } else { + post()? 
+ } + }, + MaskSetting::Pre => pre()?, + MaskSetting::Post => post()?, + }; + + debug_assert_eq!(array.len(), mask.set_bits()); + + Ok(array) }) .collect::>>()?; let Some(df) = dfs.first().map(|(_, df)| df) else { return Ok(Vec::new()); }; - let rearranged_schema = df.schema(); + let mut rearranged_schema = df.schema(); + rearranged_schema.merge(Schema::from(schema)); rg_columns .par_chunks_exact_mut(num_dead_columns) @@ -465,10 +514,12 @@ fn rg_to_dfs_prefiltered( .map(|(rg_cols, (_, mut df))| { let rg_cols = rg_cols.iter_mut().map(std::mem::take).collect::>(); + debug_assert!(rg_cols.iter().all(|v| v.len() == df.height())); + // We first add the columns with the live columns at the start. Then, we do a // projections that puts the columns at the right spot. df._add_columns(rg_cols, &rearranged_schema)?; - let df = df.select(schema.get_names())?; + let df = df.select(schema.get_names_owned())?; PolarsResult::Ok(df) }) @@ -560,7 +611,7 @@ fn rg_to_dfs_optionally_par_over_columns( let mut df = unsafe { DataFrame::new_no_checks(columns) }; if let Some(rc) = &row_index { - df.with_row_index_mut(&rc.name, Some(*previous_row_count + rc.offset)); + df.with_row_index_mut(rc.name.clone(), Some(*previous_row_count + rc.offset)); } materialize_hive_partitions(&mut df, schema.as_ref(), hive_partition_columns, rg_slice.1); @@ -674,7 +725,10 @@ fn rg_to_dfs_par_over_rg( let mut df = unsafe { DataFrame::new_no_checks(columns) }; if let Some(rc) = &row_index { - df.with_row_index_mut(&rc.name, Some(row_count_start as IdxSize + rc.offset)); + df.with_row_index_mut( + rc.name.clone(), + Some(row_count_start as IdxSize + rc.offset), + ); } materialize_hive_partitions( @@ -921,7 +975,7 @@ impl BatchedParquetReader { chunk_size: usize, use_statistics: bool, hive_partition_columns: Option>, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, mut parallel: ParallelStrategy, ) -> PolarsResult { let n_row_groups = metadata.row_groups.len(); @@ -961,7 
+1015,7 @@ impl BatchedParquetReader { use_statistics, hive_partition_columns: hive_partition_columns.map(Arc::from), include_file_path: include_file_path - .map(|(col, path)| StringChunked::full(&col, &path, 1)), + .map(|(col, path)| StringChunked::full(col, &path, 1)), has_returned: false, }) } diff --git a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs index 30eb593191eb..f5b52437dd82 100644 --- a/crates/polars-io/src/parquet/read/reader.rs +++ b/crates/polars-io/src/parquet/read/reader.rs @@ -38,7 +38,7 @@ pub struct ParquetReader { metadata: Option, predicate: Option>, hive_partition_columns: Option>, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, use_statistics: bool, } @@ -134,7 +134,7 @@ impl ParquetReader { pub fn with_include_file_path( mut self, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, ) -> Self { self.include_file_path = include_file_path; self @@ -234,7 +234,7 @@ impl SerReader for ParquetReader { unsafe { df.with_column_unchecked( StringChunked::full( - col, + col.clone(), value, if df.width() > 0 { df.height() } else { n_rows }, ) @@ -259,7 +259,7 @@ pub struct ParquetAsyncReader { row_index: Option, use_statistics: bool, hive_partition_columns: Option>, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, schema: Option, parallel: ParallelStrategy, } @@ -362,7 +362,7 @@ impl ParquetAsyncReader { pub fn with_include_file_path( mut self, - include_file_path: Option<(Arc, Arc)>, + include_file_path: Option<(PlSmallStr, Arc)>, ) -> Self { self.include_file_path = include_file_path; self diff --git a/crates/polars-io/src/parquet/read/utils.rs b/crates/polars-io/src/parquet/read/utils.rs index bb476a5fad08..34cc752dd782 100644 --- a/crates/polars-io/src/parquet/read/utils.rs +++ b/crates/polars-io/src/parquet/read/utils.rs @@ -20,7 +20,7 @@ pub fn materialize_empty_df( let mut df = 
DataFrame::empty_with_arrow_schema(&schema); if let Some(row_index) = row_index { - df.insert_column(0, Series::new_empty(&row_index.name, &IDX_DTYPE)) + df.insert_column(0, Series::new_empty(row_index.name.clone(), &IDX_DTYPE)) .unwrap(); } diff --git a/crates/polars-io/src/partition.rs b/crates/polars-io/src/partition.rs index 98508cc14e5a..901e2036c102 100644 --- a/crates/polars-io/src/partition.rs +++ b/crates/polars-io/src/partition.rs @@ -28,18 +28,20 @@ impl WriteDataFrameToFile for IpcWriterOptions { } } -/// Write a partitioned parquet dataset. This functionality is unstable. -pub fn write_partitioned_dataset( +fn write_partitioned_dataset_impl( df: &mut DataFrame, path: &Path, - partition_by: &[S], - file_write_options: &O, + partition_by: Vec, + file_write_options: &W, chunk_size: usize, ) -> PolarsResult<()> where - S: AsRef, - O: WriteDataFrameToFile + Send + Sync, + W: WriteDataFrameToFile + Send + Sync, { + let partition_by = partition_by + .into_iter() + .map(Into::into) + .collect::>(); // Ensure we have a single chunk as the gather will otherwise rechunk per group. df.as_single_chunk_par(); @@ -184,3 +186,23 @@ where Ok(()) } + +/// Write a partitioned parquet dataset. This functionality is unstable. 
+pub fn write_partitioned_dataset( + df: &mut DataFrame, + path: &Path, + partition_by: I, + file_write_options: &W, + chunk_size: usize, +) -> PolarsResult<()> +where + I: IntoIterator, + S: Into, + W: WriteDataFrameToFile + Send + Sync, +{ + let partition_by = partition_by + .into_iter() + .map(Into::into) + .collect::>(); + write_partitioned_dataset_impl(df, path, partition_by, file_write_options, chunk_size) +} diff --git a/crates/polars-io/src/predicates.rs b/crates/polars-io/src/predicates.rs index 08ad7685461c..8acfc304a1a8 100644 --- a/crates/polars-io/src/predicates.rs +++ b/crates/polars-io/src/predicates.rs @@ -8,7 +8,7 @@ pub trait PhysicalIoExpr: Send + Sync { fn evaluate_io(&self, df: &DataFrame) -> PolarsResult; /// Get the variables that are used in the expression i.e. live variables. - fn live_variables(&self) -> Option>>; + fn live_variables(&self) -> Option>; /// Can take &dyn Statistics and determine of a file should be /// read -> `true` @@ -94,7 +94,7 @@ impl ColumnStats { } } - pub fn field_name(&self) -> &SmartString { + pub fn field_name(&self) -> &PlSmallStr { self.field.name() } diff --git a/crates/polars-io/src/shared.rs b/crates/polars-io/src/shared.rs index 735490b0bcb3..1e13f9ed68c1 100644 --- a/crates/polars-io/src/shared.rs +++ b/crates/polars-io/src/shared.rs @@ -68,7 +68,7 @@ pub(crate) fn finish_reader( let mut df = DataFrame::try_from((batch, arrow_schema.fields.as_slice()))?; if let Some(rc) = &row_index { - df.with_row_index_mut(&rc.name, Some(current_num_rows + rc.offset)); + df.with_row_index_mut(rc.name.clone(), Some(current_num_rows + rc.offset)); } if let Some(predicate) = &predicate { @@ -100,7 +100,7 @@ pub(crate) fn finish_reader( .fields .iter() .map(|fld| { - Series::try_from((fld.name.as_str(), new_empty_array(fld.data_type.clone()))) + Series::try_from((fld.name.clone(), new_empty_array(fld.data_type.clone()))) }) .collect::>()?; DataFrame::new(empty_cols)? 
@@ -124,7 +124,7 @@ pub(crate) fn schema_to_arrow_checked( let fields = schema.iter_fields().map(|field| { #[cfg(feature = "object")] polars_ensure!(!matches!(field.data_type(), DataType::Object(_, _)), ComputeError: "cannot write 'Object' datatype to {}", _file_name); - Ok(field.data_type().to_arrow_field(field.name().as_str(), compat_level)) + Ok(field.data_type().to_arrow_field(field.name().clone(), compat_level)) }).collect::>>()?; Ok(ArrowSchema::from(fields)) } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 22b3a8d82b18..0294b123687d 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -210,7 +210,7 @@ pub static BOOLEAN_RE: Lazy = Lazy::new(|| { }); pub fn materialize_projection( - with_columns: Option<&[String]>, + with_columns: Option<&[PlSmallStr]>, schema: &Schema, hive_partitions: Option<&[Series]>, has_row_index: bool, diff --git a/crates/polars-json/src/json/deserialize.rs b/crates/polars-json/src/json/deserialize.rs index 9a4c9e27d0cb..2cf49f9ce0ef 100644 --- a/crates/polars-json/src/json/deserialize.rs +++ b/crates/polars-json/src/json/deserialize.rs @@ -290,7 +290,7 @@ pub(crate) fn _deserialize<'a, A: Borrow>>( BorrowedValue::String(v) => match (tu, tz) { (_, None) => temporal_conversions::utf8_to_naive_timestamp_scalar(v, "%+", tu), (_, Some(ref tz)) => { - let tz = temporal_conversions::parse_offset(tz).unwrap(); + let tz = temporal_conversions::parse_offset(tz.as_str()).unwrap(); temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu) }, }, diff --git a/crates/polars-json/src/json/infer_schema.rs b/crates/polars-json/src/json/infer_schema.rs index a525334a3d8c..19c96f5cf659 100644 --- a/crates/polars-json/src/json/infer_schema.rs +++ b/crates/polars-json/src/json/infer_schema.rs @@ -2,6 +2,7 @@ use std::borrow::Borrow; use arrow::datatypes::{ArrowDataType, Field}; use indexmap::map::Entry; +use polars_utils::pl_str::PlSmallStr; use 
simd_json::borrowed::Object; use simd_json::{BorrowedValue, StaticNode}; @@ -30,7 +31,7 @@ fn infer_object(inner: &Object) -> PolarsResult { .map(|(key, value)| infer(value).map(|dt| (key, dt))) .map(|maybe_dt| { let (key, dt) = maybe_dt?; - Ok(Field::new(key.as_ref(), dt, true)) + Ok(Field::new(key.as_ref().into(), dt, true)) }) .collect::>>()?; Ok(ArrowDataType::Struct(fields)) @@ -51,7 +52,9 @@ fn infer_array(values: &[BorrowedValue]) -> PolarsResult { }; Ok(ArrowDataType::LargeList(Box::new(Field::new( - ITEM_NAME, dt, true, + PlSmallStr::from_static(ITEM_NAME), + dt, + true, )))) } @@ -110,7 +113,7 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> Arr .into_iter() .map(|(name, dts)| { let dts = dts.into_iter().collect::>(); - Field::new(name, coerce_data_type(&dts), true) + Field::new(name.into(), coerce_data_type(&dts), true) }) .collect(); return Struct(fields); @@ -126,18 +129,12 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> Arr }) .collect(); return LargeList(Box::new(Field::new( - ITEM_NAME, + PlSmallStr::from_static(ITEM_NAME), coerce_data_type(inner_types.as_slice()), true, ))); } else if datatypes.len() > 2 { - return datatypes - .iter() - .map(|dt| dt.borrow().clone()) - .reduce(|a, b| coerce_data_type(&[a, b])) - .unwrap() - .borrow() - .clone(); + return coerce_data_type(datatypes); } let (lhs, rhs) = (datatypes[0].borrow(), datatypes[1].borrow()); @@ -145,15 +142,27 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> Arr (lhs, rhs) if lhs == rhs => lhs.clone(), (LargeList(lhs), LargeList(rhs)) => { let inner = coerce_data_type(&[lhs.data_type(), rhs.data_type()]); - LargeList(Box::new(Field::new(ITEM_NAME, inner, true))) + LargeList(Box::new(Field::new( + PlSmallStr::from_static(ITEM_NAME), + inner, + true, + ))) }, (scalar, LargeList(list)) => { let inner = coerce_data_type(&[scalar, list.data_type()]); - LargeList(Box::new(Field::new(ITEM_NAME, inner, true))) + LargeList(Box::new(Field::new( + PlSmallStr::from_static(ITEM_NAME), 
+ inner, + true, + ))) }, (LargeList(list), scalar) => { let inner = coerce_data_type(&[scalar, list.data_type()]); - LargeList(Box::new(Field::new(ITEM_NAME, inner, true))) + LargeList(Box::new(Field::new( + PlSmallStr::from_static(ITEM_NAME), + inner, + true, + ))) }, (Float64, Int64) => Float64, (Int64, Float64) => Float64, diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index e784049cd78a..03fcc0d8b2c8 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -29,7 +29,6 @@ memchr = { workspace = true } once_cell = { workspace = true } pyo3 = { workspace = true, optional = true } rayon = { workspace = true } -smartstring = { workspace = true } tokio = { workspace = true, optional = true } [dev-dependencies] @@ -231,6 +230,7 @@ serde = [ "polars-time?/serde", "polars-io/serde", "polars-ops/serde", + "polars-utils/serde", ] fused = ["polars-plan/fused", "polars-ops/fused"] list_sets = ["polars-plan/list_sets", "polars-ops/list_sets"] diff --git a/crates/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs index 8b62a417b932..879f31301c89 100644 --- a/crates/polars-lazy/src/dsl/eval.rs +++ b/crates/polars-lazy/src/dsl/eval.rs @@ -13,7 +13,7 @@ pub(crate) fn eval_field_to_dtype(f: &Field, expr: &Expr, list: bool) -> Field { .cloned() .unwrap_or_else(|| f.data_type().clone()); - let df = Series::new_empty("", &dtype).into_frame(); + let df = Series::new_empty(PlSmallStr::const_default(), &dtype).into_frame(); #[cfg(feature = "python")] let out = { @@ -27,12 +27,12 @@ pub(crate) fn eval_field_to_dtype(f: &Field, expr: &Expr, list: bool) -> Field { Ok(out) => { let dtype = out.get_columns()[0].dtype(); if list { - Field::new(f.name(), DataType::List(Box::new(dtype.clone()))) + Field::new(f.name().clone(), DataType::List(Box::new(dtype.clone()))) } else { - Field::new(f.name(), dtype.clone()) + Field::new(f.name().clone(), dtype.clone()) } }, - Err(_) => Field::new(f.name(), DataType::Null), + Err(_) => 
Field::new(f.name().clone(), DataType::Null), } } @@ -46,8 +46,8 @@ pub trait ExprEvalExtension: IntoExpr + Sized { let this = self.into_expr(); let expr2 = expr.clone(); let func = move |mut s: Series| { - let name = s.name().to_string(); - s.rename(""); + let name = s.name().clone(); + s.rename(PlSmallStr::const_default()); // Ensure we get the new schema. let output_field = eval_field_to_dtype(s.field().as_ref(), &expr, false); @@ -107,7 +107,7 @@ pub trait ExprEvalExtension: IntoExpr + Sized { }) .collect::>>()? }; - let s = Series::new(&name, avs); + let s = Series::new(name, avs); if s.dtype() != output_field.data_type() { s.cast(output_field.data_type()).map(Some) diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index 34df33c10c50..0ef5769725d5 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -50,7 +50,12 @@ fn run_per_sublist( parallel: bool, output_field: Field, ) -> PolarsResult> { - let phys_expr = prepare_expression_for_context("", expr, lst.inner_dtype(), Context::Default)?; + let phys_expr = prepare_expression_for_context( + PlSmallStr::const_default(), + expr, + lst.inner_dtype(), + Context::Default, + )?; let state = ExecutionState::new(); @@ -72,7 +77,7 @@ fn run_per_sublist( } }) }) - .collect_ca_with_dtype("", output_field.dtype.clone()); + .collect_ca_with_dtype(PlSmallStr::const_default(), output_field.dtype.clone()); err = m_err.into_inner().unwrap(); ca } else { @@ -99,7 +104,7 @@ fn run_per_sublist( return Err(err); } - ca.rename(s.name()); + ca.rename(s.name().clone()); if ca.dtype() != output_field.data_type() { ca.cast(output_field.data_type()).map(Some) @@ -109,7 +114,7 @@ fn run_per_sublist( } fn run_on_group_by_engine( - name: &str, + name: PlSmallStr, lst: &ListChunked, expr: &Expr, ) -> PolarsResult> { @@ -118,14 +123,19 @@ fn run_on_group_by_engine( let groups = offsets_to_groups(arr.offsets()).unwrap(); // List elements in a series. 
- let values = Series::try_from(("", arr.values().clone())).unwrap(); + let values = Series::try_from((PlSmallStr::const_default(), arr.values().clone())).unwrap(); let inner_dtype = lst.inner_dtype(); // SAFETY: // Invariant in List means values physicals can be cast to inner dtype let values = unsafe { values.cast_unchecked(inner_dtype).unwrap() }; let df_context = values.into_frame(); - let phys_expr = prepare_expression_for_context("", expr, inner_dtype, Context::Aggregation)?; + let phys_expr = prepare_expression_for_context( + PlSmallStr::const_default(), + expr, + inner_dtype, + Context::Aggregation, + )?; let state = ExecutionState::new(); let mut ac = phys_expr.evaluate_on_groups(&df_context, &groups, &state)?; @@ -173,7 +183,10 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { // ensure we get the new schema let output_field = eval_field_to_dtype(lst.ref_field(), &expr, true); if lst.is_empty() { - return Ok(Some(Series::new_empty(s.name(), output_field.data_type()))); + return Ok(Some(Series::new_empty( + s.name().clone(), + output_field.data_type(), + ))); } if lst.null_count() == lst.len() { return Ok(Some(s.cast(output_field.data_type())?)); @@ -187,7 +200,7 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { }; if fits_idx_size && s.null_count() == 0 && !is_user_apply() { - run_on_group_by_engine(s.name(), &lst, &expr) + run_on_group_by_engine(s.name().clone(), &lst, &expr) } else { run_per_sublist(s, &lst, &expr, parallel, output_field) } diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 4f838d2dccfb..94c15296825e 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -37,7 +37,7 @@ use polars_mem_engine::{create_physical_plan, Executor}; use polars_ops::frame::JoinCoalesce; pub use polars_plan::frame::{AllowedOptimizations, OptFlags}; use polars_plan::global::FETCH_ROWS; -use smartstring::alias::String as SmartString; +use 
polars_utils::pl_str::PlSmallStr; use crate::frame::cached_arenas::CachedArena; #[cfg(feature = "streaming")] @@ -293,14 +293,11 @@ impl LazyFrame { /// } /// ``` /// See [`SortMultipleOptions`] for more options. - pub fn sort(self, by: impl IntoVec, sort_options: SortMultipleOptions) -> Self { + pub fn sort(self, by: impl IntoVec, sort_options: SortMultipleOptions) -> Self { let opt_state = self.get_opt_state(); let lp = self .get_plan_builder() - .sort( - by.into_vec().into_iter().map(|x| col(&x)).collect(), - sort_options, - ) + .sort(by.into_vec().into_iter().map(col).collect(), sort_options) .build(); Self::from_logical_plan(lp, opt_state) } @@ -380,7 +377,7 @@ impl LazyFrame { /// } /// ``` pub fn reverse(self) -> Self { - self.select(vec![col("*").reverse()]) + self.select(vec![col(PlSmallStr::from_static("*")).reverse()]) } /// Rename columns in the DataFrame. @@ -398,8 +395,8 @@ impl LazyFrame { { let iter = existing.into_iter(); let cap = iter.size_hint().0; - let mut existing_vec: Vec = Vec::with_capacity(cap); - let mut new_vec: Vec = Vec::with_capacity(cap); + let mut existing_vec: Vec = Vec::with_capacity(cap); + let mut new_vec: Vec = Vec::with_capacity(cap); // TODO! should this error if `existing` and `new` have different lengths? // Currently, the longer of the two is truncated. @@ -468,7 +465,7 @@ impl LazyFrame { /// /// See the method on [Series](polars_core::series::SeriesTrait::shift) for more info on the `shift` operation. pub fn shift>(self, n: E) -> Self { - self.select(vec![col("*").shift(n.into())]) + self.select(vec![col(PlSmallStr::from_static("*")).shift(n.into())]) } /// Shift the values by a given period and fill the parts that will be empty due to this operation @@ -476,7 +473,9 @@ impl LazyFrame { /// /// See the method on [Series](polars_core::series::SeriesTrait::shift) for more info on the `shift` operation. 
pub fn shift_and_fill, IE: Into>(self, n: E, fill_value: IE) -> Self { - self.select(vec![col("*").shift_and_fill(n.into(), fill_value.into())]) + self.select(vec![ + col(PlSmallStr::from_static("*")).shift_and_fill(n.into(), fill_value.into()) + ]) } /// Fill None values in the DataFrame with an expression. @@ -507,6 +506,8 @@ impl LazyFrame { let cast_cols: Vec = dtypes .into_iter() .map(|(name, dt)| { + let name = PlSmallStr::from_str(name); + if strict { col(name).strict_cast(dt) } else { @@ -525,9 +526,9 @@ impl LazyFrame { /// Cast all frame columns to the given dtype, resulting in a new LazyFrame pub fn cast_all(self, dtype: DataType, strict: bool) -> Self { self.with_columns(vec![if strict { - col("*").strict_cast(dtype) + col(PlSmallStr::from_static("*")).strict_cast(dtype) } else { - col("*").cast(dtype) + col(PlSmallStr::from_static("*")).cast(dtype) }]) } @@ -710,6 +711,7 @@ impl LazyFrame { // if it fails in a todo!() error if auto_new_streaming is set. let mut new_stream_lazy = self.clone(); new_stream_lazy.opt_state |= OptFlags::NEW_STREAMING; + new_stream_lazy.opt_state &= !OptFlags::STREAMING; let mut alp_plan = new_stream_lazy.to_alp_optimized()?; let stream_lp_top = alp_plan.lp_arena.add(IR::Sink { input: alp_plan.lp_top, @@ -929,7 +931,7 @@ impl LazyFrame { /// Select (and optionally rename, with [`alias`](crate::dsl::Expr::alias)) columns from the query. /// /// Columns can be selected with [`col`]; - /// If you want to select all columns use `col("*")`. + /// If you want to select all columns use `col(PlSmallStr::from_static("*"))`. 
/// /// # Example /// @@ -948,7 +950,7 @@ impl LazyFrame { /// /// This function selects all columns except "foo" /// fn exclude_a_column(df: DataFrame) -> LazyFrame { /// df.lazy() - /// .select(&[col("*").exclude(["foo"])]) + /// .select(&[col(PlSmallStr::from_static("*")).exclude(["foo"])]) /// } /// ``` pub fn select>(self, exprs: E) -> Self { @@ -1054,7 +1056,7 @@ impl LazyFrame { .to_field(&self.collect_schema().unwrap(), Context::Default) .unwrap(); return self.with_column(index_column).rolling( - Expr::Column(Arc::from(output_field.name().as_str())), + Expr::Column(output_field.name().clone()), group_by, options, ); @@ -1099,7 +1101,7 @@ impl LazyFrame { .to_field(&self.collect_schema().unwrap(), Context::Default) .unwrap(); return self.with_column(index_column).group_by_dynamic( - Expr::Column(Arc::from(output_field.name().as_str())), + Expr::Column(output_field.name().clone()), group_by, options, ); @@ -1175,7 +1177,7 @@ impl LazyFrame { /// Creates the Cartesian product from both frames, preserving the order of the left keys. #[cfg(feature = "cross_join")] - pub fn cross_join(self, other: LazyFrame, suffix: Option) -> LazyFrame { + pub fn cross_join(self, other: LazyFrame, suffix: Option) -> LazyFrame { self.join( other, vec![], @@ -1543,7 +1545,7 @@ impl LazyFrame { /// Aggregate all the columns as the sum of their null value count. pub fn null_count(self) -> LazyFrame { - self.select(vec![col("*").null_count()]) + self.select(vec![col(PlSmallStr::from_static("*")).null_count()]) } /// Drop non-unique rows and maintain the order of kept rows. @@ -1552,7 +1554,7 @@ impl LazyFrame { /// `None`, all columns are considered. 
pub fn unique_stable( self, - subset: Option>, + subset: Option>, keep_strategy: UniqueKeepStrategy, ) -> LazyFrame { self.unique_stable_generic(subset, keep_strategy) @@ -1714,7 +1716,7 @@ impl LazyFrame { function, optimizations, schema, - name.unwrap_or("ANONYMOUS UDF"), + PlSmallStr::from_static(name.unwrap_or("ANONYMOUS UDF")), ) .build(); Self::from_logical_plan(lp, opt_state) @@ -1750,15 +1752,20 @@ impl LazyFrame { /// # Warning /// This can have a negative effect on query performance. This may for instance block /// predicate pushdown optimization. - pub fn with_row_index(mut self, name: &str, offset: Option) -> LazyFrame { + pub fn with_row_index(mut self, name: S, offset: Option) -> LazyFrame + where + S: Into, + { + let name = name.into(); let add_row_index_in_map = match &mut self.logical_plan { DslPlan::Scan { file_options: options, scan_type, .. } if !matches!(scan_type, FileScan::Anonymous { .. }) => { + let name = name.clone(); options.row_index = Some(RowIndex { - name: Arc::from(name), + name, offset: offset.unwrap_or(0), }); false @@ -1767,10 +1774,7 @@ impl LazyFrame { }; if add_row_index_in_map { - self.map_private(DslFunction::RowIndex { - name: Arc::from(name), - offset, - }) + self.map_private(DslFunction::RowIndex { name, offset }) } else { self } @@ -1778,7 +1782,7 @@ impl LazyFrame { /// Return the number of non-null elements for each column. 
pub fn count(self) -> LazyFrame { - self.select(vec![col("*").count()]) + self.select(vec![col(PlSmallStr::from_static("*")).count()]) } /// Unnest the given `Struct` columns: the fields of the `Struct` type will be @@ -1798,11 +1802,15 @@ impl LazyFrame { } #[cfg(feature = "merge_sorted")] - pub fn merge_sorted(self, other: LazyFrame, key: &str) -> PolarsResult { + pub fn merge_sorted(self, other: LazyFrame, key: S) -> PolarsResult + where + S: Into, + { // The two DataFrames are temporary concatenated // this indicates until which chunk the data is from the left df // this trick allows us to reuse the `Union` architecture to get map over // two DataFrames + let key = key.into(); let left = self.map_private(DslFunction::FunctionIR(FunctionIR::Rechunk)); let q = concat( &[left, other], @@ -1814,7 +1822,7 @@ impl LazyFrame { )?; Ok( q.map_private(DslFunction::FunctionIR(FunctionIR::MergeSorted { - column: Arc::from(key), + column: key, })), ) } @@ -1847,7 +1855,7 @@ impl LazyGroupBy { /// Group by and aggregate. /// /// Select a column with [col] and choose an aggregation. - /// If you want to aggregate all columns use `col("*")`. + /// If you want to aggregate all columns use `col(PlSmallStr::from_static("*"))`. 
/// /// # Example /// @@ -1894,8 +1902,13 @@ impl LazyGroupBy { .filter_map(|expr| expr_output_name(expr).ok()) .collect::>(); - self.agg([col("*").exclude(&keys).head(n)]) - .explode_impl([col("*").exclude(&keys)], true) + self.agg([col(PlSmallStr::from_static("*")) + .exclude(keys.iter().cloned()) + .head(n)]) + .explode_impl( + [col(PlSmallStr::from_static("*")).exclude(keys.iter().cloned())], + true, + ) } /// Return last n rows of each group @@ -1906,8 +1919,13 @@ impl LazyGroupBy { .filter_map(|expr| expr_output_name(expr).ok()) .collect::>(); - self.agg([col("*").exclude(&keys).tail(n)]) - .explode_impl([col("*").exclude(&keys)], true) + self.agg([col(PlSmallStr::from_static("*")) + .exclude(keys.iter().cloned()) + .tail(n)]) + .explode_impl( + [col(PlSmallStr::from_static("*")).exclude(keys.iter().cloned())], + true, + ) } /// Apply a function over the groups as a new DataFrame. @@ -1949,7 +1967,7 @@ pub struct JoinBuilder { right_on: Vec, allow_parallel: bool, force_parallel: bool, - suffix: Option, + suffix: Option, validation: JoinValidation, coalesce: JoinCoalesce, join_nulls: bool, @@ -2035,8 +2053,11 @@ impl JoinBuilder { /// Suffix to add duplicate column names in join. /// Defaults to `"_right"` if this method is never called. 
- pub fn suffix>(mut self, suffix: S) -> Self { - self.suffix = Some(suffix.as_ref().to_string()); + pub fn suffix(mut self, suffix: S) -> Self + where + S: Into, + { + self.suffix = Some(suffix.into()); self } diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index eedcdc700e1e..f559a15d260f 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -21,14 +21,19 @@ impl PhysicalAggExpr for PivotExpr { fn evaluate(&self, df: &DataFrame, groups: &GroupsProxy) -> PolarsResult { let state = ExecutionState::new(); let dtype = df.get_columns()[0].dtype(); - let phys_expr = prepare_expression_for_context("", &self.0, dtype, Context::Aggregation)?; + let phys_expr = prepare_expression_for_context( + PlSmallStr::const_default(), + &self.0, + dtype, + Context::Aggregation, + )?; phys_expr .evaluate_on_groups(df, groups, &state) .map(|mut ac| ac.aggregated()) } - fn root_name(&self) -> PolarsResult<&str> { - Ok("") + fn root_name(&self) -> PolarsResult<&PlSmallStr> { + Ok(PlSmallStr::empty_static()) } } @@ -46,9 +51,9 @@ where I0: IntoIterator, I1: IntoIterator, I2: IntoIterator, - S0: AsRef, - S1: AsRef, - S2: AsRef, + S0: Into, + S1: Into, + S2: Into, { // make sure that the root column is replaced let agg_expr = agg_expr.map(|agg_expr| { @@ -72,9 +77,9 @@ where I0: IntoIterator, I1: IntoIterator, I2: IntoIterator, - S0: AsRef, - S1: AsRef, - S2: AsRef, + S0: Into, + S1: Into, + S2: Into, { // make sure that the root column is replaced let agg_expr = agg_expr.map(|agg_expr| { diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index 46d1304a0b96..024f2a26bffb 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -61,7 +61,7 @@ //! assert!(new.column("new_column") //! .unwrap() //! .equals( -//! &Series::new("new_column", &[50, 40, 30, 20, 10]) +//! &Series::new("new_column".into(), &[50, 40, 30, 20, 10]) //! ) //! ); //! 
``` @@ -94,7 +94,7 @@ //! assert!(new.column("new_column") //! .unwrap() //! .equals( -//! &Series::new("new_column", &[100, 100, 3, 4, 5]) +//! &Series::new("new_column".into(), &[100, 100, 3, 4, 5]) //! ) //! ); //! ``` @@ -147,7 +147,7 @@ //! col("column_a") //! // apply a custom closure Series => Result //! .map(|_s| { -//! Ok(Some(Series::new("", &[6.0f32, 6.0, 6.0, 6.0, 6.0]))) +//! Ok(Some(Series::new("".into(), &[6.0f32, 6.0, 6.0, 6.0, 6.0]))) //! }, //! // return type of the closure //! GetOutput::from_type(DataType::Float64)).alias("new_column") diff --git a/crates/polars-lazy/src/physical_plan/exotic.rs b/crates/polars-lazy/src/physical_plan/exotic.rs index 0e2a68d9f562..5950f9f52b47 100644 --- a/crates/polars-lazy/src/physical_plan/exotic.rs +++ b/crates/polars-lazy/src/physical_plan/exotic.rs @@ -6,14 +6,14 @@ use crate::prelude::*; #[cfg(feature = "pivot")] pub(crate) fn prepare_eval_expr(expr: Expr) -> Expr { expr.map_expr(|e| match e { - Expr::Column(_) => Expr::Column(Arc::from("")), - Expr::Nth(_) => Expr::Column(Arc::from("")), + Expr::Column(_) => Expr::Column(PlSmallStr::const_default()), + Expr::Nth(_) => Expr::Column(PlSmallStr::const_default()), e => e, }) } pub(crate) fn prepare_expression_for_context( - name: &str, + name: PlSmallStr, expr: &Expr, dtype: &DataType, ctxt: Context, diff --git a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs index 92103b3d8170..8776c46060ba 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs @@ -26,7 +26,7 @@ impl PhysicalIoExpr for Wrap { }; h.evaluate_io(df) } - fn live_variables(&self) -> Option>> { + fn live_variables(&self) -> Option> { // @TODO: This should not unwrap Some(expr_to_leaf_column_names(self.0.as_expression()?)) } diff --git a/crates/polars-lazy/src/scan/anonymous_scan.rs 
b/crates/polars-lazy/src/scan/anonymous_scan.rs index 8b83046693da..4c3d9a03e723 100644 --- a/crates/polars-lazy/src/scan/anonymous_scan.rs +++ b/crates/polars-lazy/src/scan/anonymous_scan.rs @@ -42,7 +42,7 @@ impl LazyFrame { .into(); if let Some(rc) = args.row_index { - lf = lf.with_row_index(&rc.name, Some(rc.offset)) + lf = lf.with_row_index(rc.name.clone(), Some(rc.offset)) }; Ok(lf) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index c9892cfab5ff..54e9c77e2480 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -19,7 +19,7 @@ pub struct LazyCsvReader { cache: bool, read_options: CsvReadOptions, cloud_options: Option, - include_file_paths: Option>, + include_file_paths: Option, } #[cfg(feature = "csv")] @@ -120,13 +120,13 @@ impl LazyCsvReader { /// Set the comment prefix for this instance. Lines starting with this prefix will be ignored. #[must_use] - pub fn with_comment_prefix(self, comment_prefix: Option<&str>) -> Self { + pub fn with_comment_prefix(self, comment_prefix: Option) -> Self { self.map_parse_options(|opts| { - opts.with_comment_prefix(comment_prefix.map(|s| { + opts.with_comment_prefix(comment_prefix.clone().map(|s| { if s.len() == 1 && s.chars().next().unwrap().is_ascii() { CommentPrefix::Single(s.as_bytes()[0]) } else { - CommentPrefix::Multi(Arc::from(s)) + CommentPrefix::Multi(s) } })) }) @@ -263,7 +263,7 @@ impl LazyCsvReader { Ok(self.with_schema(Some(Arc::new(schema)))) } - pub fn with_include_file_paths(mut self, include_file_paths: Option>) -> Self { + pub fn with_include_file_paths(mut self, include_file_paths: Option) -> Self { self.include_file_paths = include_file_paths; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 9c716afa060c..f7b91d427200 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -48,7 +48,7 @@ pub 
trait LazyFileListReader: Clone { lf = lf.slice(0, n_rows as IdxSize) }; if let Some(rc) = self.row_index() { - lf = lf.with_row_index(&rc.name, Some(rc.offset)) + lf = lf.with_row_index(rc.name.clone(), Some(rc.offset)) }; Ok(lf) diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index ec9a1a584ffc..9d981bc74c0e 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -16,7 +16,7 @@ pub struct ScanArgsIpc { pub memory_map: bool, pub cloud_options: Option, pub hive_options: HiveOptions, - pub include_file_paths: Option>, + pub include_file_paths: Option, } impl Default for ScanArgsIpc { diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 6cb4a8c8cae7..0effd26d5497 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -23,7 +23,7 @@ pub struct LazyJsonLineReader { pub(crate) infer_schema_length: Option, pub(crate) n_rows: Option, pub(crate) ignore_errors: bool, - pub(crate) include_file_paths: Option>, + pub(crate) include_file_paths: Option, pub(crate) cloud_options: Option, } @@ -109,7 +109,7 @@ impl LazyJsonLineReader { self } - pub fn with_include_file_paths(mut self, include_file_paths: Option>) -> Self { + pub fn with_include_file_paths(mut self, include_file_paths: Option) -> Self { self.include_file_paths = include_file_paths; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index b2ee670828a9..e87e90e3330a 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -20,7 +20,7 @@ pub struct ScanArgsParquet { pub cache: bool, /// Expand path given via globbing rules. 
pub glob: bool, - pub include_file_paths: Option>, + pub include_file_paths: Option, } impl Default for ScanArgsParquet { @@ -80,7 +80,7 @@ impl LazyFileListReader for LazyParquetReader { // It's a bit hacky, but this row_index function updates the schema. if let Some(row_index) = row_index { - lf = lf.with_row_index(&row_index.name, Some(row_index.offset)) + lf = lf.with_row_index(row_index.name.clone(), Some(row_index.offset)) } lf.opt_state |= OptFlags::FILE_CACHING; diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index 0e67cba50566..54387451a8b7 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -6,7 +6,7 @@ use super::*; #[test] #[cfg(feature = "dtype-datetime")] fn test_agg_list_type() -> PolarsResult<()> { - let s = Series::new("foo", &[1, 2, 3]); + let s = Series::new("foo".into(), &[1, 2, 3]); let s = s.cast(&DataType::Datetime(TimeUnit::Nanoseconds, None))?; let l = unsafe { s.agg_list(&GroupsProxy::Idx(vec![(0, unitvec![0, 1, 2])].into())) }; diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index 8c3f6e5334b2..57beafc63033 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ b/crates/polars-lazy/src/tests/io.rs @@ -136,7 +136,7 @@ fn test_parquet_statistics() -> PolarsResult<()> { // issue: 13427 let out = scan_foods_parquet(par) - .filter(col("calories").is_in(lit(Series::new("", [0, 500])))) + .filter(col("calories").is_in(lit(Series::new("".into(), [0, 500])))) .collect()?; assert_eq!(out.shape(), (0, 4)); @@ -590,7 +590,7 @@ fn test_row_index_on_files() -> PolarsResult<()> { for offset in [0 as IdxSize, 10] { let lf = LazyCsvReader::new(FOODS_CSV) .with_row_index(Some(RowIndex { - name: Arc::from("index"), + name: PlSmallStr::from_static("index"), offset, })) .finish()?; @@ -665,7 +665,7 @@ fn scan_anonymous_fn_with_options() -> PolarsResult<()> { fn scan(&self, scan_opts: AnonymousScanArgs) -> 
PolarsResult { assert_eq!(scan_opts.with_columns.clone().unwrap().len(), 2); assert_eq!(scan_opts.n_rows, Some(3)); - let out = fruits_cars().select(scan_opts.with_columns.unwrap().as_ref())?; + let out = fruits_cars().select(scan_opts.with_columns.unwrap().iter().cloned())?; Ok(out.slice(0, scan_opts.n_rows.unwrap())) } } @@ -701,7 +701,7 @@ fn scan_small_dtypes() -> PolarsResult<()> { let df = LazyCsvReader::new(FOODS_CSV) .with_has_header(true) .with_dtype_overwrite(Some(Arc::new(Schema::from_iter([Field::new( - "sugars_g", + "sugars_g".into(), dt.clone(), )])))) .finish()? diff --git a/crates/polars-lazy/src/tests/mod.rs b/crates/polars-lazy/src/tests/mod.rs index 8b1a51212d18..f4ba3e876a65 100644 --- a/crates/polars-lazy/src/tests/mod.rs +++ b/crates/polars-lazy/src/tests/mod.rs @@ -6,14 +6,14 @@ mod cse; mod io; mod logical; mod optimization_checks; +#[cfg(all(feature = "strings", feature = "cse"))] +mod pdsh; mod predicate_queries; mod projection_queries; mod queries; mod schema; #[cfg(feature = "streaming")] mod streaming; -#[cfg(all(feature = "strings", feature = "cse"))] -mod tpch; fn get_arenas() -> (Arena, Arena) { let expr_arena = Arena::with_capacity(16); diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index ecaaba71056d..e01ad342f061 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -308,7 +308,10 @@ pub fn test_predicate_block_cast() -> PolarsResult<()> { let out = lf.collect()?; let s = out.column("value").unwrap(); - assert_eq!(s, &Series::new("value", [1.0f32, 2.0])); + assert_eq!( + s, + &Series::new(PlSmallStr::from_static("value"), [1.0f32, 2.0]) + ); } Ok(()) @@ -496,7 +499,7 @@ fn test_with_column_prune() -> PolarsResult<()> { })); assert_eq!( q.collect_schema().unwrap().as_ref(), - &Schema::from_iter([Field::new("c1", DataType::Int32)]) + 
&Schema::from_iter([Field::new(PlSmallStr::from_static("c1"), DataType::Int32)]) ); Ok(()) } diff --git a/crates/polars-lazy/src/tests/tpch.rs b/crates/polars-lazy/src/tests/pdsh.rs similarity index 83% rename from crates/polars-lazy/src/tests/tpch.rs rename to crates/polars-lazy/src/tests/pdsh.rs index 49eed184f72a..426b19506684 100644 --- a/crates/polars-lazy/src/tests/tpch.rs +++ b/crates/polars-lazy/src/tests/pdsh.rs @@ -1,10 +1,10 @@ -//! The tpch files only got ten rows, so after all the joins filters there is not data +//! The PDSH files only got ten rows, so after all the joins filters there is not data //! Still we can use this to test the schema, operation correctness on empty data, and optimizations //! taken. use super::*; const fn base_path() -> &'static str { - "../../examples/datasets/tpc_heads" + "../../examples/datasets/pds_heads" } fn region() -> LazyFrame { @@ -98,14 +98,14 @@ fn test_q2() -> PolarsResult<()> { let out = q.collect()?; let schema = Schema::from_iter([ - Field::new("s_acctbal", DataType::Float64), - Field::new("s_name", DataType::String), - Field::new("n_name", DataType::String), - Field::new("p_partkey", DataType::Int64), - Field::new("p_mfgr", DataType::String), - Field::new("s_address", DataType::String), - Field::new("s_phone", DataType::String), - Field::new("s_comment", DataType::String), + Field::new("s_acctbal".into(), DataType::Float64), + Field::new("s_name".into(), DataType::String), + Field::new("n_name".into(), DataType::String), + Field::new("p_partkey".into(), DataType::Int64), + Field::new("p_mfgr".into(), DataType::String), + Field::new("s_address".into(), DataType::String), + Field::new("s_phone".into(), DataType::String), + Field::new("s_comment".into(), DataType::String), ]); assert_eq!(&out.schema(), &schema); diff --git a/crates/polars-lazy/src/tests/predicate_queries.rs b/crates/polars-lazy/src/tests/predicate_queries.rs index 815f2b75febe..855d9463f814 100644 --- 
a/crates/polars-lazy/src/tests/predicate_queries.rs +++ b/crates/polars-lazy/src/tests/predicate_queries.rs @@ -48,7 +48,7 @@ fn test_issue_2472() -> PolarsResult<()> { .extract(lit(r"(\d+-){4}(\w+)-"), 2) .cast(DataType::Int32) .alias("age"); - let predicate = col("age").is_in(lit(Series::new("", [2i32]))); + let predicate = col("age").is_in(lit(Series::new("".into(), [2i32]))); let out = base .clone() diff --git a/crates/polars-lazy/src/tests/projection_queries.rs b/crates/polars-lazy/src/tests/projection_queries.rs index 43a6088f4efb..b2cff519c05a 100644 --- a/crates/polars-lazy/src/tests/projection_queries.rs +++ b/crates/polars-lazy/src/tests/projection_queries.rs @@ -128,7 +128,10 @@ fn concat_str_regex_expansion() -> PolarsResult<()> { .select([concat_str([col(r"^b_a_\d$")], ";", false).alias("concatenated")]) .collect()?; let s = out.column("concatenated")?; - assert_eq!(s, &Series::new("concatenated", ["a--;;", ";b--;", ";;c--"])); + assert_eq!( + s, + &Series::new("concatenated".into(), ["a--;;", ";b--;", ";;c--"]) + ); Ok(()) } diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index f54854e79f36..49d7aa120ea4 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -217,7 +217,10 @@ fn test_lazy_ternary_and_predicates() { let new = ldf.collect().unwrap(); let length = new.column("sepal_length").unwrap(); - assert_eq!(length, &Series::new("sepal_length", &[5.1f64, 5.0, 5.4])); + assert_eq!( + length, + &Series::new("sepal_length".into(), &[5.1f64, 5.0, 5.4]) + ); assert_eq!(new.shape(), (3, 6)); } @@ -344,7 +347,7 @@ fn test_lazy_query_8() -> PolarsResult<()> { let mut selection = vec![]; - for c in &["A", "B", "C", "D", "E"] { + for &c in &["A", "B", "C", "D", "E"] { let e = when(col(c).is_in(col("E"))) .then(col("A")) .otherwise(Null {}.lit()) @@ -412,7 +415,7 @@ fn test_lazy_query_10() { use polars_core::export::chrono::Duration as ChronoDuration; let date = 
NaiveDate::from_ymd_opt(2021, 3, 5).unwrap(); let x: Series = DatetimeChunked::from_naive_datetime( - "x", + "x".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(12, 0, 0).unwrap()), NaiveDateTime::new(date, NaiveTime::from_hms_opt(13, 0, 0).unwrap()), @@ -422,7 +425,7 @@ fn test_lazy_query_10() { ) .into(); let y: Series = DatetimeChunked::from_naive_datetime( - "y", + "y".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(11, 0, 0).unwrap()), NaiveDateTime::new(date, NaiveTime::from_hms_opt(11, 0, 0).unwrap()), @@ -438,7 +441,7 @@ fn test_lazy_query_10() { .collect() .unwrap(); let z: Series = DurationChunked::from_duration( - "z", + "z".into(), [ ChronoDuration::try_hours(1).unwrap(), ChronoDuration::try_hours(2).unwrap(), @@ -449,7 +452,7 @@ fn test_lazy_query_10() { .into(); assert!(out.column("z").unwrap().equals(&z)); let x: Series = DatetimeChunked::from_naive_datetime( - "x", + "x".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(2, 0, 0).unwrap()), NaiveDateTime::new(date, NaiveTime::from_hms_opt(3, 0, 0).unwrap()), @@ -459,7 +462,7 @@ fn test_lazy_query_10() { ) .into(); let y: Series = DatetimeChunked::from_naive_datetime( - "y", + "y".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(1, 0, 0).unwrap()), NaiveDateTime::new(date, NaiveTime::from_hms_opt(1, 0, 0).unwrap()), @@ -498,8 +501,8 @@ fn test_lazy_query_7() { ]; let data = vec![Some(1.), Some(2.), Some(3.), Some(4.), None, None]; let df = DataFrame::new(vec![ - DatetimeChunked::from_naive_datetime("date", dates, TimeUnit::Nanoseconds).into(), - Series::new("data", data), + DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds).into(), + Series::new("data".into(), data), ]) .unwrap(); // this tests if predicate pushdown not interferes with the shift data. 
@@ -520,7 +523,7 @@ fn test_lazy_query_7() { #[test] fn test_lazy_shift_and_fill_all() { let data = &[1, 2, 3]; - let df = DataFrame::new(vec![Series::new("data", data)]).unwrap(); + let df = DataFrame::new(vec![Series::new("data".into(), data)]).unwrap(); let out = df .lazy() .with_column(col("data").shift(lit(1)).fill_null(lit(0)).alias("output")) @@ -711,7 +714,7 @@ fn test_lazy_group_by_apply() { df.lazy() .group_by([col("fruits")]) .agg([col("cars").apply( - |s: Series| Ok(Some(Series::new("", &[s.len() as u32]))), + |s: Series| Ok(Some(Series::new("".into(), &[s.len() as u32]))), GetOutput::from_type(DataType::UInt32), )]) .collect() @@ -1163,9 +1166,9 @@ fn test_fill_forward() -> PolarsResult<()> { let agg = out.column("b")?.list()?; let a: Series = agg.get_as_series(0).unwrap(); - assert!(a.equals(&Series::new("b", &[1, 1]))); + assert!(a.equals(&Series::new("b".into(), &[1, 1]))); let a: Series = agg.get_as_series(2).unwrap(); - assert!(a.equals(&Series::new("b", &[1, 1]))); + assert!(a.equals(&Series::new("b".into(), &[1, 1]))); let a: Series = agg.get_as_series(1).unwrap(); assert_eq!(a.null_count(), 1); Ok(()) @@ -1468,8 +1471,8 @@ fn test_singleton_broadcast() -> PolarsResult<()> { #[test] fn test_list_in_select_context() -> PolarsResult<()> { - let s = Series::new("a", &[1, 2, 3]); - let mut builder = get_list_builder(s.dtype(), s.len(), 1, s.name()).unwrap(); + let s = Series::new("a".into(), &[1, 2, 3]); + let mut builder = get_list_builder(s.dtype(), s.len(), 1, s.name().clone()).unwrap(); builder.append_series(&s).unwrap(); let expected = builder.finish().into_series(); @@ -1546,8 +1549,8 @@ fn test_round_after_agg() -> PolarsResult<()> { #[test] #[cfg(feature = "dtype-date")] fn test_fill_nan() -> PolarsResult<()> { - let s0 = Series::new("date", &[1, 2, 3]).cast(&DataType::Date)?; - let s1 = Series::new("float", &[Some(1.0), Some(f32::NAN), Some(3.0)]); + let s0 = Series::new("date".into(), &[1, 2, 3]).cast(&DataType::Date)?; + let s1 = 
Series::new("float".into(), &[Some(1.0), Some(f32::NAN), Some(3.0)]); let df = DataFrame::new(vec![s0, s1])?; let out = df.lazy().fill_nan(Null {}.lit()).collect()?; @@ -1694,7 +1697,7 @@ fn test_single_ranked_group() -> PolarsResult<()> { #[cfg(feature = "diff")] fn empty_df() -> PolarsResult<()> { let df = fruits_cars(); - let df = df.filter(&BooleanChunked::full("", false, df.height()))?; + let df = df.filter(&BooleanChunked::full("".into(), false, df.height()))?; df.lazy() .select([ @@ -1757,7 +1760,7 @@ fn test_is_in() -> PolarsResult<()> { let out = df .lazy() .group_by_stable([col("fruits")]) - .agg([col("cars").is_in(lit(Series::new("a", ["beetle", "vw"])))]) + .agg([col("cars").is_in(lit(Series::new("a".into(), ["beetle", "vw"])))]) .collect()?; let out = out.column("cars").unwrap(); diff --git a/crates/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs index d8d76384ed0c..d76d4c90dc2e 100644 --- a/crates/polars-lazy/src/tests/streaming.rs +++ b/crates/polars-lazy/src/tests/streaming.rs @@ -264,7 +264,7 @@ fn test_streaming_left_join() -> PolarsResult<()> { #[cfg(feature = "cross_join")] fn test_streaming_slice() -> PolarsResult<()> { let vals = (0..100).collect::>(); - let s = Series::new("", vals); + let s = Series::new("".into(), vals); let lf_a = df![ "a" => s ]? 
diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index 3867012d3f0c..ec4a691eb547 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -153,7 +153,7 @@ fn estimate_unique_count(keys: &[Series], mut sample_size: usize) -> PolarsResul .map(|s| s.slice(offset, sample_size)) .collect::>(); let df = unsafe { DataFrame::new_no_checks(keys) }; - let names = df.get_column_names(); + let names = df.get_column_names().into_iter().cloned(); let gb = df.group_by(names).unwrap(); Ok(finish(gb.get_groups())) } diff --git a/crates/polars-mem-engine/src/executors/group_by_rolling.rs b/crates/polars-mem-engine/src/executors/group_by_rolling.rs index 437976b103a3..524fc8e63fc0 100644 --- a/crates/polars-mem-engine/src/executors/group_by_rolling.rs +++ b/crates/polars-mem-engine/src/executors/group_by_rolling.rs @@ -26,7 +26,10 @@ unsafe fn update_keys(keys: &mut [Series], groups: &GroupsProxy) { }, GroupsProxy::Slice { groups, .. } => { for key in keys.iter_mut() { - let indices = groups.iter().map(|[first, _len]| *first).collect_ca(""); + let indices = groups + .iter() + .map(|[first, _len]| *first) + .collect_ca(PlSmallStr::const_default()); *key = key.take_unchecked(&indices); } }, diff --git a/crates/polars-mem-engine/src/executors/projection_simple.rs b/crates/polars-mem-engine/src/executors/projection_simple.rs index 686321833bd2..f88ad62c8956 100644 --- a/crates/polars-mem-engine/src/executors/projection_simple.rs +++ b/crates/polars-mem-engine/src/executors/projection_simple.rs @@ -6,7 +6,7 @@ pub struct ProjectionSimple { } impl ProjectionSimple { - fn execute_impl(&mut self, df: DataFrame, columns: &[SmartString]) -> PolarsResult { + fn execute_impl(&mut self, df: DataFrame, columns: &[PlSmallStr]) -> PolarsResult { // No duplicate check as that an invariant of this node. 
df._select_impl_unchecked(columns.as_ref()) } @@ -15,10 +15,10 @@ impl ProjectionSimple { impl Executor for ProjectionSimple { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { state.should_stop()?; - let columns = self.columns.iter_names().cloned().collect::>(); + let columns = self.columns.get_names_owned(); let profile_name = if state.has_node_timer() { - let name = comma_delimited("simple-projection".to_string(), &columns); + let name = comma_delimited("simple-projection".to_string(), columns.as_slice()); Cow::Owned(name) } else { Cow::Borrowed("") @@ -26,9 +26,9 @@ impl Executor for ProjectionSimple { let df = self.input.execute(state)?; if state.has_node_timer() { - state.record(|| self.execute_impl(df, &columns), profile_name) + state.record(|| self.execute_impl(df, columns.as_slice()), profile_name) } else { - self.execute_impl(df, &columns) + self.execute_impl(df, columns.as_slice()) } } } diff --git a/crates/polars-mem-engine/src/executors/projection_utils.rs b/crates/polars-mem-engine/src/executors/projection_utils.rs index 1e85f272defb..979c29321cb9 100644 --- a/crates/polars-mem-engine/src/executors/projection_utils.rs +++ b/crates/polars-mem-engine/src/executors/projection_utils.rs @@ -5,7 +5,7 @@ use super::*; pub(super) fn profile_name( s: &dyn PhysicalExpr, input_schema: &Schema, -) -> PolarsResult { +) -> PolarsResult { match s.to_field(input_schema) { Err(e) => Err(e), Ok(fld) => Ok(fld.name), diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 936d602afc5f..50ed974e128b 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -105,7 +105,7 @@ impl CsvExec { let path = path.to_str().unwrap(); unsafe { df.with_column_unchecked( - StringChunked::full(col, path, df.height()).into_series(), + StringChunked::full(col.clone(), path, df.height()).into_series(), ) }; } @@ -218,7 +218,7 @@ impl CsvExec 
{ accumulate_dataframes_vertical(dfs.into_iter().flat_map(|dfs| dfs.into_iter()))?; if let Some(row_index) = self.file_options.row_index.clone() { - df.with_row_index_mut(row_index.name.as_ref(), Some(row_index.offset)); + df.with_row_index_mut(row_index.name.clone(), Some(row_index.offset)); } df @@ -235,7 +235,9 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy().into()]; + let mut ids = vec![PlSmallStr::from_str( + self.paths[0].to_string_lossy().as_ref(), + )]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 574b4c43252b..18d47c172bcd 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -185,7 +185,9 @@ impl IpcExec { impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy().into()]; + let mut ids = vec![PlSmallStr::from_str( + self.paths[0].to_string_lossy().as_ref(), + )]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/mod.rs b/crates/polars-mem-engine/src/executors/scan/mod.rs index ddc1f1b4e6e1..1b46d40b9044 100644 --- a/crates/polars-mem-engine/src/executors/scan/mod.rs +++ b/crates/polars-mem-engine/src/executors/scan/mod.rs @@ -38,7 +38,7 @@ type Predicate = Option>; #[cfg(any(feature = "ipc", feature = "parquet"))] fn prepare_scan_args( predicate: Option>, - with_columns: &mut Option>, + with_columns: &mut Option>, schema: &mut SchemaRef, has_row_index: bool, hive_partitions: Option<&[Series]>, @@ -62,7 +62,7 @@ fn prepare_scan_args( pub struct DataFrameExec { pub(crate) df: Arc, 
pub(crate) filter: Option>, - pub(crate) projection: Option>, + pub(crate) projection: Option>, pub(crate) predicate_has_windows: bool, } @@ -74,7 +74,7 @@ impl Executor for DataFrameExec { // projection should be before selection as those are free // TODO: this is only the case if we don't create new columns if let Some(projection) = &self.projection { - df = df.select(projection.as_slice())?; + df = df.select(projection.iter().cloned())?; } if let Some(selection) = &self.filter { diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 5e17a289eac7..680e5cbf3bed 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -56,10 +56,12 @@ impl JsonExec { if n_rows == Some(0) { let mut df = DataFrame::empty_with_schema(schema); if let Some(col) = &self.file_scan_options.include_file_paths { - unsafe { df.with_column_unchecked(StringChunked::full_null(col, 0).into_series()) }; + unsafe { + df.with_column_unchecked(StringChunked::full_null(col.clone(), 0).into_series()) + }; } if let Some(row_index) = &self.file_scan_options.row_index { - df.with_row_index_mut(row_index.name.as_ref(), Some(row_index.offset)); + df.with_row_index_mut(row_index.name.clone(), Some(row_index.offset)); } return Ok(df); } @@ -132,7 +134,7 @@ impl JsonExec { let path = p.to_str().unwrap(); unsafe { df.with_column_unchecked( - StringChunked::full(col, path, df.height()).into_series(), + StringChunked::full(col.clone(), path, df.height()).into_series(), ) }; } @@ -148,7 +150,7 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let ids = vec![self.paths[0].to_string_lossy().into()]; + let ids = vec![self.paths[0].to_string_lossy().clone()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff --git 
a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bc3f69ac95ab..bd3d87ff8832 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -482,7 +482,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy().into()]; + let mut ids = vec![self.paths[0].to_string_lossy()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/python_scan.rs b/crates/polars-mem-engine/src/executors/scan/python_scan.rs index 1b44453b088d..270c52ea963c 100644 --- a/crates/polars-mem-engine/src/executors/scan/python_scan.rs +++ b/crates/polars-mem-engine/src/executors/scan/python_scan.rs @@ -68,7 +68,12 @@ impl Executor for PythonScanExec { self.options.python_source, PythonScanSource::Pyarrow | PythonScanSource::Cuda ) { - let args = (python_scan_function, with_columns, predicate, n_rows); + let args = ( + python_scan_function, + with_columns.map(|x| x.into_iter().map(|x| x.to_string()).collect::>()), + predicate, + n_rows, + ); callable.call1(args).map_err(to_compute_err) } else { // If there are filters, take smaller chunks to ensure we can keep memory @@ -80,7 +85,7 @@ impl Executor for PythonScanExec { }; let args = ( python_scan_function, - with_columns, + with_columns.map(|x| x.into_iter().map(|x| x.to_string()).collect::>()), predicate, n_rows, batch_size, diff --git a/crates/polars-mem-engine/src/executors/sort.rs b/crates/polars-mem-engine/src/executors/sort.rs index 820cdb65fdfd..23374abea7ac 100644 --- a/crates/polars-mem-engine/src/executors/sort.rs +++ b/crates/polars-mem-engine/src/executors/sort.rs @@ -1,3 +1,5 @@ +use polars_utils::format_pl_smallstr; + use super::*; pub(crate) struct SortExec { @@ -29,9 
+31,14 @@ impl SortExec { // therefore we rename more complex expressions so that // polars core does not match these. if !matches!(e.as_expression(), Some(&Expr::Column(_))) { - s.rename(&format!("_POLARS_SORT_BY_{i}")); + s.rename(format_pl_smallstr!("_POLARS_SORT_BY_{i}")); } - polars_ensure!(s.len() == height, ShapeMismatch: "sort expressions must have same length as DataFrame, got DataFrame height: {} and Series length: {}", height, s.len()); + polars_ensure!( + s.len() == height, + ShapeMismatch: "sort expressions must have same \ + length as DataFrame, got DataFrame height: {} and Series length: {}", + height, s.len() + ); Ok(s) }) .collect::>>()?; diff --git a/crates/polars-mem-engine/src/executors/unique.rs b/crates/polars-mem-engine/src/executors/unique.rs index d9e390f1ca07..69c7b19c528a 100644 --- a/crates/polars-mem-engine/src/executors/unique.rs +++ b/crates/polars-mem-engine/src/executors/unique.rs @@ -19,7 +19,7 @@ impl Executor for UniqueExec { .options .subset .as_ref() - .map(|v| v.iter().map(|n| n.to_string()).collect::>()); + .map(|v| v.iter().cloned().collect::>()); let keep = self.options.keep_strategy; state.record( @@ -28,10 +28,12 @@ impl Executor for UniqueExec { return Ok(df); } - match self.options.maintain_order { - true => df.unique_stable(subset.as_deref(), keep, self.options.slice), - false => df.unique(subset.as_deref(), keep, self.options.slice), - } + df.unique_impl( + self.options.maintain_order, + subset, + keep, + self.options.slice, + ) }, Cow::Borrowed("unique()"), ) diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 163b45726837..c60e91af7021 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -35,7 +35,6 @@ rayon = { workspace = true } regex = { workspace = true } serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } -smartstring = { workspace = true } unicode-reverse = { workspace = true, optional = true } 
[dependencies.jsonpath_lib] @@ -83,7 +82,7 @@ timezones = ["chrono", "chrono-tz", "polars-core/temporal", "polars-core/timezon random = ["rand", "rand_distr"] rank = ["rand"] find_many = ["aho-corasick"] -serde = ["dep:serde", "polars-core/serde"] +serde = ["dep:serde", "polars-core/serde", "polars-utils/serde"] # extra utilities for BinaryChunked binary_encoding = ["base64", "hex"] diff --git a/crates/polars-ops/src/chunked_array/array/any_all.rs b/crates/polars-ops/src/chunked_array/array/any_all.rs index 49bb3872d05d..270885082818 100644 --- a/crates/polars-ops/src/chunked_array/array/any_all.rs +++ b/crates/polars-ops/src/chunked_array/array/any_all.rs @@ -43,12 +43,12 @@ pub(super) fn array_all(ca: &ArrayChunked) -> PolarsResult { let chunks = ca .downcast_iter() .map(|arr| array_all_any(arr, arrow::compute::boolean::all, true)); - Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) + Ok(BooleanChunked::try_from_chunk_iter(ca.name().clone(), chunks)?.into_series()) } pub(super) fn array_any(ca: &ArrayChunked) -> PolarsResult { let chunks = ca .downcast_iter() .map(|arr| array_all_any(arr, arrow::compute::boolean::any, false)); - Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) + Ok(BooleanChunked::try_from_chunk_iter(ca.name().clone(), chunks)?.into_series()) } diff --git a/crates/polars-ops/src/chunked_array/array/count.rs b/crates/polars-ops/src/chunked_array/array/count.rs index 528a9750306c..1938244fffdd 100644 --- a/crates/polars-ops/src/chunked_array/array/count.rs +++ b/crates/polars-ops/src/chunked_array/array/count.rs @@ -8,7 +8,7 @@ use super::*; #[cfg(feature = "array_count")] pub fn array_count_matches(ca: &ArrayChunked, value: AnyValue) -> PolarsResult { - let value = Series::new("", [value]); + let value = Series::new(PlSmallStr::const_default(), [value]); let ca = ca.apply_to_inner(&|s| { ChunkCompare::<&Series>::equal_missing(&s, &value).map(|ca| ca.into_series()) diff --git 
a/crates/polars-ops/src/chunked_array/array/dispersion.rs b/crates/polars-ops/src/chunked_array/array/dispersion.rs index 056b1b87d09a..17924d7c38bb 100644 --- a/crates/polars-ops/src/chunked_array/array/dispersion.rs +++ b/crates/polars-ops/src/chunked_array/array/dispersion.rs @@ -5,24 +5,24 @@ pub(super) fn median_with_nulls(ca: &ArrayChunked) -> PolarsResult { DataType::Float32 => { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median().map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = "dtype-duration")] DataType::Duration(tu) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median().map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(*tu).into_series() }, _ => { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median())) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; - out.rename(ca.name()); + out.rename(ca.name().clone()); Ok(out) } @@ -31,14 +31,14 @@ pub(super) fn std_with_nulls(ca: &ArrayChunked, ddof: u8) -> PolarsResult { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().std(ddof).map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = "dtype-duration")] DataType::Duration(tu) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().std(ddof).map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(*tu).into_series() }, _ => { @@ -50,7 +50,7 @@ pub(super) fn std_with_nulls(ca: &ArrayChunked, ddof: u8) -> PolarsResult PolarsResult { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof).map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = 
"dtype-duration")] DataType::Duration(TimeUnit::Milliseconds) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof).map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(TimeUnit::Milliseconds).into_series() }, #[cfg(feature = "dtype-duration")] @@ -80,16 +80,16 @@ pub(super) fn var_with_nulls(ca: &ArrayChunked, ddof: u8) -> PolarsResult { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; - out.rename(ca.name()); + out.rename(ca.name().clone()); Ok(out) } diff --git a/crates/polars-ops/src/chunked_array/array/get.rs b/crates/polars-ops/src/chunked_array/array/get.rs index 46bf7232e390..1df931f165f7 100644 --- a/crates/polars-ops/src/chunked_array/array/get.rs +++ b/crates/polars-ops/src/chunked_array/array/get.rs @@ -11,7 +11,7 @@ fn array_get_literal(ca: &ArrayChunked, idx: i64, null_on_oob: bool) -> PolarsRe .downcast_iter() .map(|arr| sub_fixed_size_list_get_literal(arr, idx, null_on_oob)) .collect::>>()?; - Series::try_from((ca.name(), chunks)) + Series::try_from((ca.name().clone(), chunks)) .unwrap() .cast(ca.inner_dtype()) } @@ -31,7 +31,11 @@ pub fn array_get( if let Some(index) = index { array_get_literal(ca, index, null_on_oob) } else { - Ok(Series::full_null(ca.name(), ca.len(), ca.inner_dtype())) + Ok(Series::full_null( + ca.name().clone(), + ca.len(), + ca.inner_dtype(), + )) } }, len if len == ca.len() => { @@ -65,5 +69,5 @@ where .zip(rhs.downcast_iter()) .map(|(lhs_arr, rhs_arr)| op(lhs_arr, rhs_arr, null_on_oob)) .collect::>>()?; - Series::try_from((lhs.name(), chunks)) + Series::try_from((lhs.name().clone(), chunks)) } diff --git a/crates/polars-ops/src/chunked_array/array/join.rs b/crates/polars-ops/src/chunked_array/array/join.rs index 0ba4a517ca0f..426adb32826b 100644 --- a/crates/polars-ops/src/chunked_array/array/join.rs +++ 
b/crates/polars-ops/src/chunked_array/array/join.rs @@ -12,7 +12,7 @@ fn join_literal( }; let mut buf = String::with_capacity(128); - let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); + let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len()); ca.for_each_amortized(|opt_s| { let opt_val = opt_s.and_then(|s| { @@ -45,7 +45,7 @@ fn join_many( ignore_nulls: bool, ) -> PolarsResult { let mut buf = String::new(); - let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); + let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len()); { ca.amortized_iter() } .zip(separator) @@ -88,7 +88,7 @@ pub fn array_join( DataType::String => match separator.len() { 1 => match separator.get(0) { Some(separator) => join_literal(ca, separator, ignore_nulls), - _ => Ok(StringChunked::full_null(ca.name(), ca.len())), + _ => Ok(StringChunked::full_null(ca.name().clone(), ca.len())), }, _ => join_many(ca, separator, ignore_nulls), }, diff --git a/crates/polars-ops/src/chunked_array/array/min_max.rs b/crates/polars-ops/src/chunked_array/array/min_max.rs index bdeb76f250aa..a82de2436291 100644 --- a/crates/polars-ops/src/chunked_array/array/min_max.rs +++ b/crates/polars-ops/src/chunked_array/array/min_max.rs @@ -68,7 +68,7 @@ where } pub(super) fn array_dispatch( - name: &str, + name: PlSmallStr, values: &Series, width: usize, agg_type: AggType, diff --git a/crates/polars-ops/src/chunked_array/array/namespace.rs b/crates/polars-ops/src/chunked_array/array/namespace.rs index 1fa813be05a9..909ef5db8f6d 100644 --- a/crates/polars-ops/src/chunked_array/array/namespace.rs +++ b/crates/polars-ops/src/chunked_array/array/namespace.rs @@ -23,7 +23,7 @@ pub fn has_inner_nulls(ca: &ArrayChunked) -> bool { fn get_agg(ca: &ArrayChunked, agg_type: AggType) -> Series { let values = ca.get_inner(); let width = ca.width(); - min_max::array_dispatch(ca.name(), &values, width, agg_type) + min_max::array_dispatch(ca.name().clone(), &values, width, agg_type) 
} pub trait ArrayNameSpace: AsArray { @@ -149,7 +149,7 @@ pub trait ArrayNameSpace: AsArray { unsafe { ca.apply_amortized_same_type(|s| s.as_ref().shift(n)) } } else { ArrayChunked::full_null_with_dtype( - ca.name(), + ca.name().clone(), ca.len(), ca.inner_dtype(), ca.width(), diff --git a/crates/polars-ops/src/chunked_array/array/sum_mean.rs b/crates/polars-ops/src/chunked_array/array/sum_mean.rs index 60bd144317bc..27261a33eba0 100644 --- a/crates/polars-ops/src/chunked_array/array/sum_mean.rs +++ b/crates/polars-ops/src/chunked_array/array/sum_mean.rs @@ -53,7 +53,7 @@ pub(super) fn sum_array_numerical(ca: &ArrayChunked, inner_type: &DataType) -> S }) .collect::>(); - Series::try_from((ca.name(), chunks)).unwrap() + Series::try_from((ca.name().clone(), chunks)).unwrap() } pub(super) fn sum_with_nulls(ca: &ArrayChunked, inner_dtype: &DataType) -> PolarsResult { @@ -115,6 +115,6 @@ pub(super) fn sum_with_nulls(ca: &ArrayChunked, inner_dtype: &DataType) -> Polar }, } }; - out.rename(ca.name()); + out.rename(ca.name().clone()); Ok(out) } diff --git a/crates/polars-ops/src/chunked_array/array/to_struct.rs b/crates/polars-ops/src/chunked_array/array/to_struct.rs index 980135bcb169..9858ac2979a8 100644 --- a/crates/polars-ops/src/chunked_array/array/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/array/to_struct.rs @@ -1,14 +1,14 @@ use polars_core::export::rayon::prelude::*; use polars_core::POOL; -use polars_utils::format_smartstring; -use smartstring::alias::String as SmartString; +use polars_utils::format_pl_smallstr; +use polars_utils::pl_str::PlSmallStr; use super::*; -pub type ArrToStructNameGenerator = Arc SmartString + Send + Sync>; +pub type ArrToStructNameGenerator = Arc PlSmallStr + Send + Sync>; -pub fn arr_default_struct_name_gen(idx: usize) -> SmartString { - format_smartstring!("field_{idx}") +pub fn arr_default_struct_name_gen(idx: usize) -> PlSmallStr { + format_pl_smallstr!("field_{idx}") } pub trait ToStruct: AsArray { @@ -28,16 +28,19 @@ pub 
trait ToStruct: AsArray { (0..n_fields) .into_par_iter() .map(|i| { - ca.array_get(&Int64Chunked::from_slice("", &[i as i64]), true) - .map(|mut s| { - s.rename(&name_generator(i)); - s - }) + ca.array_get( + &Int64Chunked::from_slice(PlSmallStr::const_default(), &[i as i64]), + true, + ) + .map(|mut s| { + s.rename(name_generator(i).clone()); + s + }) }) .collect::>>() })?; - StructChunked::from_series(ca.name(), &fields) + StructChunked::from_series(ca.name().clone(), &fields) } } diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs index 6e4a29e86874..487f6a11f0df 100644 --- a/crates/polars-ops/src/chunked_array/binary/namespace.rs +++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs @@ -24,7 +24,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { match lit.len() { 1 => match lit.get(0) { Some(lit) => ca.contains(lit), - None => BooleanChunked::full_null(ca.name(), ca.len()), + None => BooleanChunked::full_null(ca.name().clone(), ca.len()), }, _ => broadcast_binary_elementwise_values(ca, lit, |src, lit| find(src, lit).is_some()), } @@ -35,7 +35,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { let ca = self.as_binary(); let f = |s: &[u8]| s.ends_with(sub); let mut out: BooleanChunked = ca.into_iter().map(|opt_s| opt_s.map(f)).collect(); - out.rename(ca.name()); + out.rename(ca.name().clone()); out } @@ -44,7 +44,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { let ca = self.as_binary(); let f = |s: &[u8]| s.starts_with(sub); let mut out: BooleanChunked = ca.into_iter().map(|opt_s| opt_s.map(f)).collect(); - out.rename(ca.name()); + out.rename(ca.name().clone()); out } @@ -53,7 +53,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { match prefix.len() { 1 => match prefix.get(0) { Some(s) => self.starts_with(s), - None => BooleanChunked::full_null(ca.name(), ca.len()), + None => BooleanChunked::full_null(ca.name().clone(), ca.len()), }, _ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| 
s.starts_with(sub)), } @@ -64,7 +64,7 @@ pub trait BinaryNameSpaceImpl: AsBinary { match suffix.len() { 1 => match suffix.get(0) { Some(s) => self.ends_with(s), - None => BooleanChunked::full_null(ca.name(), ca.len()), + None => BooleanChunked::full_null(ca.name().clone(), ca.len()), }, _ => broadcast_binary_elementwise_values(ca, suffix, |s, sub| s.ends_with(sub)), } diff --git a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs index a84bef3d1534..1637dd392707 100644 --- a/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs +++ b/crates/polars-ops/src/chunked_array/datetime/replace_time_zone.rs @@ -25,7 +25,7 @@ pub fn replace_time_zone( let mut out = datetime .0 .clone() - .into_datetime(datetime.time_unit(), time_zone.map(|x| x.to_string())); + .into_datetime(datetime.time_unit(), time_zone.map(PlSmallStr::from_str)); out.set_sorted_flag(datetime.is_sorted_flag()); return Ok(out); } @@ -64,7 +64,7 @@ pub fn replace_time_zone( ) }; - let mut out = out?.into_datetime(datetime.time_unit(), time_zone.map(|x| x.to_string())); + let mut out = out?.into_datetime(datetime.time_unit(), time_zone.map(PlSmallStr::from_str)); if from_time_zone == "UTC" && ambiguous.len() == 1 && ambiguous.get(0) == Some("raise") { // In general, the sortedness flag can't be preserved. 
// To be safe, we only do so in the simplest case when we know for sure that there is no "daylight savings weirdness" going on, i.e.: @@ -131,7 +131,7 @@ pub fn impl_replace_time_zone( }); element_iter.try_collect_arr() }); - ChunkedArray::try_from_chunk_iter(datetime.0.name(), iter) + ChunkedArray::try_from_chunk_iter(datetime.0.name().clone(), iter) }, _ => try_binary_elementwise(datetime, ambiguous, |timestamp_opt, ambiguous_opt| { match (timestamp_opt, ambiguous_opt) { diff --git a/crates/polars-ops/src/chunked_array/gather/chunked.rs b/crates/polars-ops/src/chunked_array/gather/chunked.rs index e22a9c935176..345f3689984c 100644 --- a/crates/polars-ops/src/chunked_array/gather/chunked.rs +++ b/crates/polars-ops/src/chunked_array/gather/chunked.rs @@ -140,7 +140,7 @@ impl TakeChunked for Series { out.into_decimal_unchecked(ca.precision(), ca.scale()) .into_series() }, - Null => Series::new_null(self.name(), by.len()), + Null => Series::new_null(self.name().clone(), by.len()), _ => unreachable!(), }; unsafe { out.cast_unchecked(self.dtype()).unwrap() } @@ -197,7 +197,7 @@ impl TakeChunked for Series { out.into_decimal_unchecked(ca.precision(), ca.scale()) .into_series() }, - Null => Series::new_null(self.name(), by.len()), + Null => Series::new_null(self.name().clone(), by.len()), _ => unreachable!(), }; unsafe { out.cast_unchecked(self.dtype()).unwrap() } @@ -225,7 +225,7 @@ where }); let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype); - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) } else { let targets = self.downcast_iter().collect::>(); let iter = by.iter().map(|chunk_id| { @@ -238,7 +238,7 @@ where vals.get_unchecked(array_idx as usize) }); let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype); - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) }; let sorted_flag = _update_gather_sorted_flag(self.is_sorted_flag(), sorted); 
out.set_sorted_flag(sorted_flag); @@ -264,7 +264,7 @@ where }) .collect_arr_trusted_with_dtype(arrow_dtype); - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) } else { let targets = self.downcast_iter().collect::>(); let arr = by @@ -280,7 +280,7 @@ where }) .collect_arr_trusted_with_dtype(arrow_dtype); - ChunkedArray::with_chunk(self.name(), arr) + ChunkedArray::with_chunk(self.name().clone(), arr) } } } @@ -291,7 +291,7 @@ unsafe fn take_unchecked_object(s: &Series, by: &[ChunkId], _sorted: IsSorted) - unreachable!() }; let reg = reg.as_ref().unwrap(); - let mut builder = (*reg.builder_constructor)(s.name(), by.len()); + let mut builder = (*reg.builder_constructor)(s.name().clone(), by.len()); by.iter().for_each(|chunk_id| { let (chunk_idx, array_idx) = chunk_id.extract(); @@ -307,7 +307,7 @@ unsafe fn take_opt_unchecked_object(s: &Series, by: &[NullableChunkId]) -> Serie unreachable!() }; let reg = reg.as_ref().unwrap(); - let mut builder = (*reg.builder_constructor)(s.name(), by.len()); + let mut builder = (*reg.builder_constructor)(s.name().clone(), by.len()); by.iter().for_each(|chunk_id| { if chunk_id.is_null() { @@ -409,7 +409,7 @@ unsafe fn take_unchecked_binview( ) .maybe_gc(); - let mut out = BinaryChunked::with_chunk(ca.name(), arr); + let mut out = BinaryChunked::with_chunk(ca.name().clone(), arr); let sorted_flag = _update_gather_sorted_flag(ca.is_sorted_flag(), sorted); out.set_sorted_flag(sorted_flag); out @@ -485,7 +485,7 @@ unsafe fn take_unchecked_binview_opt(ca: &BinaryChunked, by: &[NullableChunkId]) ) .maybe_gc(); - BinaryChunked::with_chunk(ca.name(), arr) + BinaryChunked::with_chunk(ca.name().clone(), arr) } #[cfg(test)] @@ -497,15 +497,15 @@ mod test { unsafe { // # Series without nulls; let mut s_1 = Series::new( - "a", + "a".into(), &["1 loooooooooooong string", "2 loooooooooooong string"], ); let s_2 = Series::new( - "a", + "a".into(), &["11 loooooooooooong string", "22 loooooooooooong 
string"], ); let s_3 = Series::new( - "a", + "a".into(), &[ "111 loooooooooooong string", "222 loooooooooooong string", @@ -529,7 +529,7 @@ mod test { ]; let out = s_1.take_chunked_unchecked(&by, IsSorted::Not); - let idx = IdxCa::new("", [0, 1, 3, 2, 4, 5, 6]); + let idx = IdxCa::new("".into(), [0, 1, 3, 2, 4, 5, 6]); let expected = s_1.rechunk().take(&idx).unwrap(); assert!(out.equals(&expected)); @@ -542,16 +542,16 @@ mod test { ]; let out = s_1.take_opt_chunked_unchecked(&by); - let idx = IdxCa::new("", [None, Some(1), Some(3), Some(2)]); + let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]); let expected = s_1.rechunk().take(&idx).unwrap(); assert!(out.equals_missing(&expected)); // # Series with nulls; let mut s_1 = Series::new( - "a", + "a".into(), &["1 loooooooooooong string 1", "2 loooooooooooong string 2"], ); - let s_2 = Series::new("a", &[Some("11 loooooooooooong string 11"), None]); + let s_2 = Series::new("a".into(), &[Some("11 loooooooooooong string 11"), None]); s_1.append(&s_2).unwrap(); // ## Ids without nulls; @@ -563,7 +563,7 @@ mod test { ]; let out = s_1.take_chunked_unchecked(&by, IsSorted::Not); - let idx = IdxCa::new("", [0, 1, 3, 2]); + let idx = IdxCa::new("".into(), [0, 1, 3, 2]); let expected = s_1.rechunk().take(&idx).unwrap(); assert!(out.equals_missing(&expected)); @@ -576,7 +576,7 @@ mod test { ]; let out = s_1.take_opt_chunked_unchecked(&by); - let idx = IdxCa::new("", [None, Some(1), Some(3), Some(2)]); + let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]); let expected = s_1.rechunk().take(&idx).unwrap(); assert!(out.equals_missing(&expected)); } diff --git a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs index ff52a6601589..5101d3668137 100644 --- a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs +++ b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs @@ -213,10 +213,10 @@ mod test { let idx_chunks: Vec<_> = 
(0..num_idx_chunks).map(|_| random_vec(&mut rng, 0..num_nonnull_elems as IdxSize, 0..200)).collect(); let null_idx_chunks: Vec<_> = idx_chunks.iter().map(|c| random_filter(&mut rng, c, 0.7..1.0)).collect(); - let nonnull_ca = UInt32Chunked::from_chunk_iter("", elem_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); - let ca = UInt32Chunked::from_chunk_iter("", null_elem_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); - let nonnull_idx_ca = IdxCa::from_chunk_iter("", idx_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); - let idx_ca = IdxCa::from_chunk_iter("", null_idx_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); + let nonnull_ca = UInt32Chunked::from_chunk_iter("".into(), elem_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); + let ca = UInt32Chunked::from_chunk_iter("".into(), null_elem_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); + let nonnull_idx_ca = IdxCa::from_chunk_iter("".into(), idx_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); + let idx_ca = IdxCa::from_chunk_iter("".into(), null_idx_chunks.iter().cloned().map(|v| v.into_iter().collect_arr())); gather_skip_nulls_check(&ca, &idx_ca); gather_skip_nulls_check(&ca, &nonnull_idx_ca); diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index 455a0c6cc921..8d7781745531 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -105,7 +105,8 @@ where if include_category { // Use AnyValue for formatting. 
let mut lower = AnyValue::Float64(lower_bound); - let mut categories = StringChunkedBuilder::new("category", breaks.len()); + let mut categories = + StringChunkedBuilder::new(PlSmallStr::from_static("category"), breaks.len()); let mut buf = String::new(); for br in &breaks { @@ -122,17 +123,20 @@ where fields.push(categories); }; if include_breakpoint { - fields.insert(0, Series::new("breakpoint", breaks)) + fields.insert( + 0, + Series::new(PlSmallStr::from_static("breakpoint"), breaks), + ) } - let count = Series::new("count", count); + let count = Series::new(PlSmallStr::from_static("count"), count); fields.push(count); if fields.len() == 1 { let out = fields.pop().unwrap(); - out.with_name(ca.name()) + out.with_name(ca.name().clone()) } else { - StructChunked::from_series(ca.name(), &fields) + StructChunked::from_series(ca.name().clone(), &fields) .unwrap() .into_series() } diff --git a/crates/polars-ops/src/chunked_array/list/any_all.rs b/crates/polars-ops/src/chunked_array/list/any_all.rs index 1364a872b133..431692780a45 100644 --- a/crates/polars-ops/src/chunked_array/list/any_all.rs +++ b/crates/polars-ops/src/chunked_array/list/any_all.rs @@ -41,12 +41,12 @@ pub(super) fn list_all(ca: &ListChunked) -> PolarsResult { let chunks = ca .downcast_iter() .map(|arr| list_all_any(arr, arrow::compute::boolean::all, true)); - Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) + Ok(BooleanChunked::try_from_chunk_iter(ca.name().clone(), chunks)?.into_series()) } pub(super) fn list_any(ca: &ListChunked) -> PolarsResult { let chunks = ca .downcast_iter() .map(|arr| list_all_any(arr, arrow::compute::boolean::any, false)); - Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) + Ok(BooleanChunked::try_from_chunk_iter(ca.name().clone(), chunks)?.into_series()) } diff --git a/crates/polars-ops/src/chunked_array/list/count.rs b/crates/polars-ops/src/chunked_array/list/count.rs index 4c562f1d1072..aaac148a2c8c 100644 --- 
a/crates/polars-ops/src/chunked_array/list/count.rs +++ b/crates/polars-ops/src/chunked_array/list/count.rs @@ -42,7 +42,7 @@ fn count_bits_set_by_offsets(values: &Bitmap, offset: &[i64]) -> Vec { #[cfg(feature = "list_count")] pub fn list_count_matches(ca: &ListChunked, value: AnyValue) -> PolarsResult { - let value = Series::new("", [value]); + let value = Series::new(PlSmallStr::const_default(), [value]); let ca = ca.apply_to_inner(&|s| { ChunkCompare::<&Series>::equal_missing(&s, &value).map(|ca| ca.into_series()) @@ -59,5 +59,5 @@ pub(super) fn count_boolean_bits(ca: &ListChunked) -> IdxCa { let out = count_bits_set_by_offsets(mask.values(), arr.offsets().as_slice()); IdxArr::from_data_default(out.into(), arr.validity().cloned()) }); - IdxCa::from_chunk_iter(ca.name(), chunks) + IdxCa::from_chunk_iter(ca.name().clone(), chunks) } diff --git a/crates/polars-ops/src/chunked_array/list/dispersion.rs b/crates/polars-ops/src/chunked_array/list/dispersion.rs index 76c4075f265b..2796ebb1de9e 100644 --- a/crates/polars-ops/src/chunked_array/list/dispersion.rs +++ b/crates/polars-ops/src/chunked_array/list/dispersion.rs @@ -5,20 +5,20 @@ pub(super) fn median_with_nulls(ca: &ListChunked) -> Series { DataType::Float32 => { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median().map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = "dtype-duration")] DataType::Duration(tu) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median().map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(*tu).into_series() }, _ => { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().median())) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; @@ -29,20 +29,20 @@ pub(super) fn std_with_nulls(ca: &ListChunked, ddof: u8) -> Series { DataType::Float32 => { 
let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().std(ddof).map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = "dtype-duration")] DataType::Duration(tu) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().std(ddof).map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(*tu).into_series() }, _ => { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().std(ddof))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; @@ -53,14 +53,14 @@ pub(super) fn var_with_nulls(ca: &ListChunked, ddof: u8) -> Series { DataType::Float32 => { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof).map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, #[cfg(feature = "dtype-duration")] DataType::Duration(TimeUnit::Milliseconds) => { let out: Int64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof).map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(TimeUnit::Milliseconds).into_series() }, #[cfg(feature = "dtype-duration")] @@ -73,13 +73,13 @@ pub(super) fn var_with_nulls(ca: &ListChunked, ddof: u8) -> Series { .list() .unwrap() .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof).map(|v| v as i64))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_duration(TimeUnit::Milliseconds).into_series() }, _ => { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().var(ddof))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; diff --git a/crates/polars-ops/src/chunked_array/list/hash.rs b/crates/polars-ops/src/chunked_array/list/hash.rs index 70400cea873a..0c567c729041 100644 --- 
a/crates/polars-ops/src/chunked_array/list/hash.rs +++ b/crates/polars-ops/src/chunked_array/list/hash.rs @@ -80,6 +80,6 @@ pub(crate) fn hash(ca: &mut ListChunked, build_hasher: PlRandomState) -> UInt64C }); let mut out = out.into_inner(); - out.rename(ca.name()); + out.rename(ca.name().clone()); out } diff --git a/crates/polars-ops/src/chunked_array/list/min_max.rs b/crates/polars-ops/src/chunked_array/list/min_max.rs index 10f275f32183..8d3a4d1d4197 100644 --- a/crates/polars-ops/src/chunked_array/list/min_max.rs +++ b/crates/polars-ops/src/chunked_array/list/min_max.rs @@ -66,7 +66,7 @@ fn min_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Series { }) .collect::>(); - Series::try_from((ca.name(), chunks)).unwrap() + Series::try_from((ca.name().clone(), chunks)).unwrap() } pub(super) fn list_min_function(ca: &ListChunked) -> PolarsResult { @@ -92,7 +92,7 @@ pub(super) fn list_min_function(ca: &ListChunked) -> PolarsResult { .try_apply_amortized(|s| { let s = s.as_ref(); let sc = s.min_reduce()?; - Ok(sc.into_series(s.name())) + Ok(sc.into_series(s.name().clone())) })? .explode() .unwrap() @@ -175,7 +175,7 @@ fn max_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Series { }) .collect::>(); - Series::try_from((ca.name(), chunks)).unwrap() + Series::try_from((ca.name().clone(), chunks)).unwrap() } pub(super) fn list_max_function(ca: &ListChunked) -> PolarsResult { @@ -202,7 +202,7 @@ pub(super) fn list_max_function(ca: &ListChunked) -> PolarsResult { .try_apply_amortized(|s| { let s = s.as_ref(); let sc = s.max_reduce()?; - Ok(sc.into_series(s.name())) + Ok(sc.into_series(s.name().clone())) })? 
.explode() .unwrap() diff --git a/crates/polars-ops/src/chunked_array/list/namespace.rs b/crates/polars-ops/src/chunked_array/list/namespace.rs index 0306375af35f..02dc0fe3e68c 100644 --- a/crates/polars-ops/src/chunked_array/list/namespace.rs +++ b/crates/polars-ops/src/chunked_array/list/namespace.rs @@ -87,7 +87,7 @@ pub trait ListNameSpaceImpl: AsList { DataType::String => match separator.len() { 1 => match separator.get(0) { Some(separator) => self.join_literal(separator, ignore_nulls), - _ => Ok(StringChunked::full_null(ca.name(), ca.len())), + _ => Ok(StringChunked::full_null(ca.name().clone(), ca.len())), }, _ => self.join_many(separator, ignore_nulls), }, @@ -99,7 +99,7 @@ pub trait ListNameSpaceImpl: AsList { let ca = self.as_list(); // used to amortize heap allocs let mut buf = String::with_capacity(128); - let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); + let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len()); ca.for_each_amortized(|opt_s| { let opt_val = opt_s.and_then(|s| { @@ -135,7 +135,7 @@ pub trait ListNameSpaceImpl: AsList { let ca = self.as_list(); // used to amortize heap allocs let mut buf = String::with_capacity(128); - let mut builder = StringChunkedBuilder::new(ca.name(), ca.len()); + let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len()); { ca.amortized_iter() .zip(separator) @@ -303,7 +303,7 @@ pub trait ListNameSpaceImpl: AsList { if let Some(periods) = periods.get(0) { ca.apply_amortized(|s| s.as_ref().shift(periods)) } else { - ListChunked::full_null_with_dtype(ca.name(), ca.len(), ca.inner_dtype()) + ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), ca.inner_dtype()) } }, _ => ca.zip_and_apply_amortized(periods, |opt_s, opt_periods| { @@ -333,7 +333,7 @@ pub trait ListNameSpaceImpl: AsList { last = *o; } }); - IdxCa::from_vec(ca.name(), lengths) + IdxCa::from_vec(ca.name().clone(), lengths) } /// Get the value by index in the sublists. 
@@ -352,7 +352,7 @@ pub trait ListNameSpaceImpl: AsList { .collect::>(); // SAFETY: every element in list has dtype equal to its inner type unsafe { - Series::try_from((ca.name(), chunks)) + Series::try_from((ca.name().clone(), chunks)) .unwrap() .cast_unchecked(ca.inner_dtype()) } @@ -366,7 +366,7 @@ pub trait ListNameSpaceImpl: AsList { (Some(n), Some(offset)) => list_ca .apply_amortized(|s| s.as_ref().gather_every(n as usize, offset as usize)), _ => ListChunked::full_null_with_dtype( - list_ca.name(), + list_ca.name().clone(), list_ca.len(), list_ca.inner_dtype(), ), @@ -383,7 +383,7 @@ pub trait ListNameSpaceImpl: AsList { }) } else { ListChunked::full_null_with_dtype( - list_ca.name(), + list_ca.name().clone(), list_ca.len(), list_ca.inner_dtype(), ) @@ -399,7 +399,7 @@ pub trait ListNameSpaceImpl: AsList { }) } else { ListChunked::full_null_with_dtype( - list_ca.name(), + list_ca.name().clone(), list_ca.len(), list_ca.inner_dtype(), ) @@ -439,7 +439,7 @@ pub trait ListNameSpaceImpl: AsList { }) .collect::>() .map(|mut ca| { - ca.rename(list_ca.name()); + ca.rename(list_ca.name().clone()); ca.into_series() }) } @@ -466,7 +466,7 @@ pub trait ListNameSpaceImpl: AsList { }) .collect::>()? }; - out.rename(list_ca.name()); + out.rename(list_ca.name().clone()); Ok(out.into_series()) }, @@ -486,7 +486,7 @@ pub trait ListNameSpaceImpl: AsList { }) .collect::>()? 
}; - out.rename(list_ca.name()); + out.rename(list_ca.name().clone()); Ok(out.into_series()) } } else { @@ -526,7 +526,7 @@ pub trait ListNameSpaceImpl: AsList { }) } else { Ok(ListChunked::full_null_with_dtype( - ca.name(), + ca.name().clone(), ca.len(), ca.inner_dtype(), )) @@ -565,7 +565,7 @@ pub trait ListNameSpaceImpl: AsList { }) } else { Ok(ListChunked::full_null_with_dtype( - ca.name(), + ca.name().clone(), ca.len(), ca.inner_dtype(), )) @@ -635,7 +635,7 @@ pub trait ListNameSpaceImpl: AsList { // there was a None, so all values will be None if to_append.len() != other_len { return Ok(ListChunked::full_null_with_dtype( - ca.name(), + ca.name().clone(), length, &inner_super_type, )); @@ -650,7 +650,7 @@ pub trait ListNameSpaceImpl: AsList { &inner_super_type, ca.get_values_size() + vals_size_other + 1, length, - ca.name(), + ca.name().clone(), )?; ca.into_iter().for_each(|opt_s| { let opt_s = opt_s.map(|mut s| { @@ -687,7 +687,7 @@ pub trait ListNameSpaceImpl: AsList { &inner_super_type, ca.get_values_size() + vals_size_other + 1, length, - ca.name(), + ca.name().clone(), )?; for _ in 0..ca.len() { diff --git a/crates/polars-ops/src/chunked_array/list/sum_mean.rs b/crates/polars-ops/src/chunked_array/list/sum_mean.rs index edbe584c436a..d35089a05dda 100644 --- a/crates/polars-ops/src/chunked_array/list/sum_mean.rs +++ b/crates/polars-ops/src/chunked_array/list/sum_mean.rs @@ -62,7 +62,7 @@ pub(super) fn sum_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Ser }) .collect::>(); - Series::try_from((ca.name(), chunks)).unwrap() + Series::try_from((ca.name().clone(), chunks)).unwrap() } pub(super) fn sum_with_nulls(ca: &ListChunked, inner_dtype: &DataType) -> PolarsResult { @@ -106,12 +106,16 @@ pub(super) fn sum_with_nulls(ca: &ListChunked, inner_dtype: &DataType) -> Polars }, // slowest sum_as_series path _ => ca - .try_apply_amortized(|s| s.as_ref().sum_reduce().map(|sc| sc.into_series("")))? 
+ .try_apply_amortized(|s| { + s.as_ref() + .sum_reduce() + .map(|sc| sc.into_series(PlSmallStr::const_default())) + })? .explode() .unwrap() .into_series(), }; - out.rename(ca.name()); + out.rename(ca.name().clone()); Ok(out) } @@ -167,7 +171,7 @@ pub(super) fn mean_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Se }) .collect::>(); - Series::try_from((ca.name(), chunks)).unwrap() + Series::try_from((ca.name().clone(), chunks)).unwrap() } pub(super) fn mean_with_nulls(ca: &ListChunked) -> Series { @@ -175,13 +179,13 @@ pub(super) fn mean_with_nulls(ca: &ListChunked) -> Series { DataType::Float32 => { let out: Float32Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().mean().map(|v| v as f32))) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, _ => { let out: Float64Chunked = ca .apply_amortized_generic(|s| s.and_then(|s| s.as_ref().mean())) - .with_name(ca.name()); + .with_name(ca.name().clone()); out.into_series() }, }; diff --git a/crates/polars-ops/src/chunked_array/list/to_struct.rs b/crates/polars-ops/src/chunked_array/list/to_struct.rs index 2f887c69e020..73798163ed48 100644 --- a/crates/polars-ops/src/chunked_array/list/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/list/to_struct.rs @@ -1,7 +1,7 @@ use polars_core::export::rayon::prelude::*; use polars_core::POOL; -use polars_utils::format_smartstring; -use smartstring::alias::String as SmartString; +use polars_utils::format_pl_smallstr; +use polars_utils::pl_str::PlSmallStr; use super::*; @@ -48,10 +48,10 @@ fn det_n_fields(ca: &ListChunked, n_fields: ListToStructWidthStrategy) -> usize } } -pub type NameGenerator = Arc SmartString + Send + Sync>; +pub type NameGenerator = Arc PlSmallStr + Send + Sync>; -pub fn _default_struct_name_gen(idx: usize) -> SmartString { - format_smartstring!("field_{idx}") +pub fn _default_struct_name_gen(idx: usize) -> PlSmallStr { + format_pl_smallstr!("field_{idx}") } pub trait ToStruct: AsList { @@ -73,14 
+73,14 @@ pub trait ToStruct: AsList { .into_par_iter() .map(|i| { ca.lst_get(i as i64, true).map(|mut s| { - s.rename(&name_generator(i)); + s.rename(name_generator(i)); s }) }) .collect::>>() })?; - StructChunked::from_series(ca.name(), &fields) + StructChunked::from_series(ca.name().clone(), &fields) } } diff --git a/crates/polars-ops/src/chunked_array/mode.rs b/crates/polars-ops/src/chunked_array/mode.rs index 26b728306c5e..a36b161775ca 100644 --- a/crates/polars-ops/src/chunked_array/mode.rs +++ b/crates/polars-ops/src/chunked_array/mode.rs @@ -89,31 +89,32 @@ mod test { #[test] fn mode_test() { - let ca = Int32Chunked::from_slice("test", &[0, 1, 2, 3, 4, 4, 5, 6, 5, 0]); + let ca = Int32Chunked::from_slice("test".into(), &[0, 1, 2, 3, 4, 4, 5, 6, 5, 0]); let mut result = mode_primitive(&ca).unwrap().to_vec(); result.sort_by_key(|a| a.unwrap()); assert_eq!(&result, &[Some(0), Some(4), Some(5)]); - let ca = Int32Chunked::from_slice("test", &[1, 1]); + let ca = Int32Chunked::from_slice("test".into(), &[1, 1]); let mut result = mode_primitive(&ca).unwrap().to_vec(); result.sort_by_key(|a| a.unwrap()); assert_eq!(&result, &[Some(1)]); - let ca = Int32Chunked::from_slice("test", &[]); + let ca = Int32Chunked::from_slice("test".into(), &[]); let mut result = mode_primitive(&ca).unwrap().to_vec(); result.sort_by_key(|a| a.unwrap()); assert_eq!(result, &[]); - let ca = Float32Chunked::from_slice("test", &[1.0f32, 2.0, 2.0, 3.0, 3.0, 3.0]); + let ca = Float32Chunked::from_slice("test".into(), &[1.0f32, 2.0, 2.0, 3.0, 3.0, 3.0]); let result = mode_primitive(&ca).unwrap().to_vec(); assert_eq!(result, &[Some(3.0f32)]); - let ca = StringChunked::from_slice("test", &["test", "test", "test", "another test"]); + let ca = + StringChunked::from_slice("test".into(), &["test", "test", "test", "another test"]); let result = mode_primitive(&ca).unwrap(); let vec_result4: Vec> = result.into_iter().collect(); assert_eq!(vec_result4, &[Some("test")]); - let mut ca_builder = 
CategoricalChunkedBuilder::new("test", 5, Default::default()); + let mut ca_builder = CategoricalChunkedBuilder::new("test".into(), 5, Default::default()); ca_builder.append_value("test"); ca_builder.append_value("test"); ca_builder.append_value("test2"); diff --git a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs index 6c811ccbbf0f..ec1d8b2c9d4f 100644 --- a/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs +++ b/crates/polars-ops/src/chunked_array/nan_propagating_aggregate.rs @@ -30,7 +30,7 @@ where .reduce(min_or_max_fn) } -pub fn nan_min_s(s: &Series, name: &str) -> Series { +pub fn nan_min_s(s: &Series, name: PlSmallStr) -> Series { match s.dtype() { DataType::Float32 => { let ca = s.f32().unwrap(); @@ -44,7 +44,7 @@ pub fn nan_min_s(s: &Series, name: &str) -> Series { } } -pub fn nan_max_s(s: &Series, name: &str) -> Series { +pub fn nan_max_s(s: &Series, name: PlSmallStr) -> Series { match s.dtype() { DataType::Float32 => { let ca = s.f32().unwrap(); diff --git a/crates/polars-ops/src/chunked_array/repeat_by.rs b/crates/polars-ops/src/chunked_array/repeat_by.rs index 8ccf9ae58141..03a0b1fe5829 100644 --- a/crates/polars-ops/src/chunked_array/repeat_by.rs +++ b/crates/polars-ops/src/chunked_array/repeat_by.rs @@ -15,7 +15,7 @@ fn check_lengths(length_srs: usize, length_by: usize) -> PolarsResult<()> { fn new_by(by: &IdxCa, len: usize) -> IdxCa { IdxCa::new( - "", + PlSmallStr::const_default(), std::iter::repeat(by.get(0).unwrap()) .take(len) .collect::>(), diff --git a/crates/polars-ops/src/chunked_array/scatter.rs b/crates/polars-ops/src/chunked_array/scatter.rs index 6e535ea60480..820989c294fe 100644 --- a/crates/polars-ops/src/chunked_array/scatter.rs +++ b/crates/polars-ops/src/chunked_array/scatter.rs @@ -143,7 +143,7 @@ impl<'a> ChunkedSet<&'a str> for &'a StringChunked { check_bounds(idx, self.len() as IdxSize)?; check_sorted(idx)?; let mut ca_iter = 
self.into_iter().enumerate(); - let mut builder = StringChunkedBuilder::new(self.name(), self.len()); + let mut builder = StringChunkedBuilder::new(self.name().clone(), self.len()); for (current_idx, current_value) in idx.iter().zip(values) { for (cnt_idx, opt_val_self) in &mut ca_iter { @@ -172,7 +172,7 @@ impl ChunkedSet for &BooleanChunked { check_bounds(idx, self.len() as IdxSize)?; check_sorted(idx)?; let mut ca_iter = self.into_iter().enumerate(); - let mut builder = BooleanChunkedBuilder::new(self.name(), self.len()); + let mut builder = BooleanChunkedBuilder::new(self.name().clone(), self.len()); for (current_idx, current_value) in idx.iter().zip(values) { for (cnt_idx, opt_val_self) in &mut ca_iter { diff --git a/crates/polars-ops/src/chunked_array/strings/concat.rs b/crates/polars-ops/src/chunked_array/strings/concat.rs index 67d1f244843d..bef1766b4089 100644 --- a/crates/polars-ops/src/chunked_array/strings/concat.rs +++ b/crates/polars-ops/src/chunked_array/strings/concat.rs @@ -6,17 +6,17 @@ use polars_core::prelude::*; // Vertically concatenate all strings in a StringChunked. pub fn str_join(ca: &StringChunked, delimiter: &str, ignore_nulls: bool) -> StringChunked { if ca.is_empty() { - return StringChunked::new(ca.name(), &[""]); + return StringChunked::new(ca.name().clone(), &[""]); } // Propagate null value. if !ignore_nulls && ca.null_count() != 0 { - return StringChunked::full_null(ca.name(), 1); + return StringChunked::full_null(ca.name().clone(), 1); } // Fast path for all nulls. if ignore_nulls && ca.null_count() == ca.len() { - return StringChunked::new(ca.name(), &[""]); + return StringChunked::new(ca.name().clone(), &[""]); } if ca.len() == 1 { @@ -44,7 +44,7 @@ pub fn str_join(ca: &StringChunked, delimiter: &str, ignore_nulls: bool) -> Stri let arr = unsafe { Utf8Array::from_data_unchecked_default(offsets.into(), buf.into(), None) }; // conversion is cheap with one value. 
let arr = utf8_to_utf8view(&arr); - StringChunked::with_chunk(ca.name(), arr) + StringChunked::with_chunk(ca.name().clone(), arr) } enum ColumnIter { @@ -61,7 +61,7 @@ pub fn hor_str_concat( ignore_nulls: bool, ) -> PolarsResult { if cas.is_empty() { - return Ok(StringChunked::full_null("", 0)); + return Ok(StringChunked::full_null(PlSmallStr::const_default(), 0)); } if cas.len() == 1 { let ca = cas[0]; @@ -84,7 +84,7 @@ pub fn hor_str_concat( ComputeError: "all series in `hor_str_concat` should have equal or unit length" ); - let mut builder = StringChunkedBuilder::new(cas[0].name(), len); + let mut builder = StringChunkedBuilder::new(cas[0].name().clone(), len); // Broadcast if appropriate. let mut cols: Vec<_> = cas @@ -141,7 +141,7 @@ mod test { #[test] fn test_str_concat() { - let ca = Int32Chunked::new("foo", &[Some(1), None, Some(3)]); + let ca = Int32Chunked::new("foo".into(), &[Some(1), None, Some(3)]); let ca_str = ca.cast(&DataType::String).unwrap(); let out = str_join(ca_str.str().unwrap(), "-", true); @@ -151,13 +151,13 @@ mod test { #[test] fn test_hor_str_concat() { - let a = StringChunked::new("a", &["foo", "bar"]); - let b = StringChunked::new("b", &["spam", "ham"]); + let a = StringChunked::new("a".into(), &["foo", "bar"]); + let b = StringChunked::new("b".into(), &["spam", "ham"]); let out = hor_str_concat(&[&a, &b], "_", true).unwrap(); assert_eq!(Vec::from(&out), &[Some("foo_spam"), Some("bar_ham")]); - let c = StringChunked::new("b", &["literal"]); + let c = StringChunked::new("b".into(), &["literal"]); let out = hor_str_concat(&[&a, &b, &c], "_", true).unwrap(); assert_eq!( Vec::from(&out), diff --git a/crates/polars-ops/src/chunked_array/strings/extract.rs b/crates/polars-ops/src/chunked_array/strings/extract.rs index 9663b8d04aae..a80820969612 100644 --- a/crates/polars-ops/src/chunked_array/strings/extract.rs +++ b/crates/polars-ops/src/chunked_array/strings/extract.rs @@ -48,8 +48,11 @@ pub(super) fn extract_groups( let reg = 
Regex::new(pat)?; let n_fields = reg.captures_len(); if n_fields == 1 { - return StructChunked::from_series(ca.name(), &[Series::new_null(ca.name(), ca.len())]) - .map(|ca| ca.into_series()); + return StructChunked::from_series( + ca.name().clone(), + &[Series::new_null(ca.name().clone(), ca.len())], + ) + .map(|ca| ca.into_series()); } let data_type = dtype.try_to_arrow(CompatLevel::newest())?; @@ -66,7 +69,7 @@ pub(super) fn extract_groups( .map(|array| extract_groups_array(array, &reg, &names, data_type.clone())) .collect::>>()?; - Series::try_from((ca.name(), chunks)) + Series::try_from((ca.name().clone(), chunks)) } fn extract_group_reg_lit( @@ -153,21 +156,21 @@ pub(super) fn extract_group( let reg = Regex::new(pat)?; try_unary_mut_with_options(ca, |arr| extract_group_reg_lit(arr, &reg, group_index)) } else { - Ok(StringChunked::full_null(ca.name(), ca.len())) + Ok(StringChunked::full_null(ca.name().clone(), ca.len())) } }, (1, _) => { if let Some(s) = ca.get(0) { try_unary_mut_with_options(pat, |pat| extract_group_array_lit(s, pat, group_index)) } else { - Ok(StringChunked::full_null(ca.name(), pat.len())) + Ok(StringChunked::full_null(ca.name().clone(), pat.len())) } }, (len_ca, len_pat) if len_ca == len_pat => try_binary_mut_with_options( ca, pat, |ca, pat| extract_group_binary(ca, pat, group_index), - ca.name(), + ca.name().clone(), ), _ => { polars_bail!(ComputeError: "ca(len: {}) and pat(len: {}) should either broadcast or have the same length", ca.len(), pat.len()) diff --git a/crates/polars-ops/src/chunked_array/strings/find_many.rs b/crates/polars-ops/src/chunked_array/strings/find_many.rs index 9bf0510e93d9..d56d8b3e014d 100644 --- a/crates/polars-ops/src/chunked_array/strings/find_many.rs +++ b/crates/polars-ops/src/chunked_array/strings/find_many.rs @@ -80,7 +80,8 @@ pub fn extract_many( ) -> PolarsResult { match patterns.dtype() { DataType::List(inner) if inner.is_string() => { - let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), 
ca.len() * 2); + let mut builder = + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.len() * 2); let patterns = patterns.list().unwrap(); let (ca, patterns) = align_chunks_binary(ca, patterns); @@ -101,7 +102,8 @@ pub fn extract_many( DataType::String => { let patterns = patterns.str().unwrap(); let ac = build_ac(patterns, ascii_case_insensitive)?; - let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.len() * 2); + let mut builder = + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.len() * 2); for arr in ca.downcast_iter() { for opt_val in arr.into_iter() { diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index ba2124e4a0be..ab36c23b432d 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -54,7 +54,7 @@ pub trait Utf8JsonPathImpl: AsString { )?; unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s))) } else { - StringChunked::full_null(ca.name(), ca.len()) + StringChunked::full_null(ca.name().clone(), ca.len()) }; Ok(out) }, @@ -112,7 +112,7 @@ pub trait Utf8JsonPathImpl: AsString { ca.len(), ) .map_err(|e| polars_err!(ComputeError: "error deserializing JSON: {}", e))?; - Series::try_from(("", array)) + Series::try_from((PlSmallStr::const_default(), array)) } fn json_path_select(&self, json_path: &str) -> PolarsResult { @@ -167,7 +167,7 @@ mod tests { #[test] fn test_json_infer() { let s = Series::new( - "json", + "json".into(), [ None, Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#), @@ -177,10 +177,10 @@ mod tests { ); let ca = s.str().unwrap(); - let inner_dtype = DataType::Struct(vec![Field::new("c", DataType::Int64)]); + let inner_dtype = DataType::Struct(vec![Field::new("c".into(), DataType::Int64)]); let expected_dtype = DataType::Struct(vec![ - Field::new("a", DataType::Int64), - Field::new("b", 
DataType::List(Box::new(inner_dtype))), + Field::new("a".into(), DataType::Int64), + Field::new("b".into(), DataType::List(Box::new(inner_dtype))), ]); assert_eq!(ca.json_infer(None).unwrap(), expected_dtype); @@ -192,7 +192,7 @@ mod tests { #[test] fn test_json_decode() { let s = Series::new( - "json", + "json".into(), [ None, Some(r#"{"a": 1, "b": "hello"}"#), @@ -203,14 +203,14 @@ mod tests { let ca = s.str().unwrap(); let expected_series = StructChunked::from_series( - "", + "".into(), &[ - Series::new("a", &[None, Some(1), Some(2), None]), - Series::new("b", &[None, Some("hello"), Some("goodbye"), None]), + Series::new("a".into(), &[None, Some(1), Some(2), None]), + Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]), ], ) .unwrap() - .with_outer_validity_chunked(BooleanChunked::new("", [false, true, true, false])) + .with_outer_validity_chunked(BooleanChunked::new("".into(), [false, true, true, false])) .into_series(); let expected_dtype = expected_series.dtype().clone(); @@ -227,7 +227,7 @@ mod tests { #[test] fn test_json_path_select() { let s = Series::new( - "json", + "json".into(), [ None, Some(r#"{"a":1,"b":[{"c":0},{"c":1}]}"#), @@ -244,7 +244,7 @@ mod tests { .equals_missing(&s)); let b_series = Series::new( - "json", + "json".into(), [ None, Some(r#"[{"c":0},{"c":1}]"#), @@ -258,7 +258,10 @@ mod tests { .into_series() .equals_missing(&b_series)); - let c_series = Series::new("json", [None, Some(r#"[0,1]"#), Some(r#"[2,5]"#), None]); + let c_series = Series::new( + "json".into(), + [None, Some(r#"[0,1]"#), Some(r#"[2,5]"#), None], + ); assert!(ca .json_path_select("$.b[:].c") .unwrap() @@ -269,7 +272,7 @@ mod tests { #[test] fn test_json_path_extract() { let s = Series::new( - "json", + "json".into(), [ None, Some(r#"{"a":1,"b":[{"c":0},{"c":1}]}"#), @@ -280,11 +283,11 @@ mod tests { let ca = s.str().unwrap(); let c_series = Series::new( - "", + "".into(), [ None, - Some(Series::new("", &[0, 1])), - Some(Series::new("", &[2, 5])), + 
Some(Series::new("".into(), &[0, 1])), + Some(Series::new("".into(), &[2, 5])), None, ], ); diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index b9c1e3041967..1f2899764e4f 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -133,10 +133,10 @@ pub trait StringNameSpaceImpl: AsString { ca.contains(pat, strict) } }, - None => Ok(BooleanChunked::full_null(ca.name(), ca.len())), + None => Ok(BooleanChunked::full_null(ca.name().clone(), ca.len())), }, (1, _) if ca.null_count() == 1 => Ok(BooleanChunked::full_null( - ca.name(), + ca.name().clone(), ca.len().max(pat.len()), )), _ => { @@ -188,10 +188,13 @@ pub trait StringNameSpaceImpl: AsString { ca.find(pat, strict) } } else { - Ok(UInt32Chunked::full_null(ca.name(), ca.len())) + Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len())) }; } else if ca.len() == 1 && ca.null_count() == 1 { - return Ok(UInt32Chunked::full_null(ca.name(), ca.len().max(pat.len()))); + return Ok(UInt32Chunked::full_null( + ca.name().clone(), + ca.len().max(pat.len()), + )); } if literal { Ok(broadcast_binary_elementwise( @@ -267,7 +270,7 @@ pub trait StringNameSpaceImpl: AsString { let out: BooleanChunked = if let Some(reg) = opt_reg { unary_elementwise_values(ca, |s| reg.is_match(s)) } else { - BooleanChunked::full_null(ca.name(), ca.len()) + BooleanChunked::full_null(ca.name().clone(), ca.len()) }; Ok(out) } @@ -292,7 +295,7 @@ pub trait StringNameSpaceImpl: AsString { Ok(rx) => Ok(unary_elementwise(ca, |opt_s| { opt_s.and_then(|s| rx.find(s)).map(|m| m.start() as u32) })), - Err(_) if !strict => Ok(UInt32Chunked::full_null(ca.name(), ca.len())), + Err(_) if !strict => Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len())), Err(e) => Err(PolarsError::ComputeError( format!("Invalid regular expression: {}", e).into(), )), @@ -402,7 +405,8 @@ pub trait StringNameSpaceImpl: 
AsString { let ca = self.as_string(); let reg = Regex::new(pat)?; - let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); + let mut builder = + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size()); for arr in ca.downcast_iter() { for opt_s in arr { match opt_s { @@ -495,7 +499,8 @@ pub trait StringNameSpaceImpl: AsString { // A sqrt(n) regex cache is not too small, not too large. let mut reg_cache = FastFixedCache::new((ca.len() as f64).sqrt() as usize); - let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); + let mut builder = + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size()); binary_elementwise_for_each(ca, pat, |opt_s, opt_pat| match (opt_s, opt_pat) { (_, None) | (None, _) => builder.append_null(), (Some(s), Some(pat)) => { @@ -560,7 +565,7 @@ pub trait StringNameSpaceImpl: AsString { let out: UInt32Chunked = broadcast_try_binary_elementwise(ca, pat, op)?; - Ok(out.with_name(ca.name())) + Ok(out.with_name(ca.name().clone())) } /// Modify the strings to their lowercase equivalent. 
diff --git a/crates/polars-ops/src/chunked_array/strings/split.rs b/crates/polars-ops/src/chunked_array/strings/split.rs index 1902f6acf10b..d86e0efac2ae 100644 --- a/crates/polars-ops/src/chunked_array/strings/split.rs +++ b/crates/polars-ops/src/chunked_array/strings/split.rs @@ -65,6 +65,8 @@ where F: Fn(&'a str, &'a str) -> I, I: Iterator, { + use polars_utils::format_pl_smallstr; + let mut arrs = (0..n) .map(|_| MutableUtf8Array::::with_capacity(ca.len())) .collect::>(); @@ -143,11 +145,11 @@ where .into_iter() .enumerate() .map(|(i, mut arr)| { - Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap() + Series::try_from((format_pl_smallstr!("field_{i}"), arr.as_box())).unwrap() }) .collect::>(); - StructChunked::from_series(ca.name(), &fields) + StructChunked::from_series(ca.name().clone(), &fields) } pub fn split_helper<'a, F, I>(ca: &'a StringChunked, by: &'a StringChunked, op: F) -> ListChunked @@ -158,7 +160,7 @@ where if by.len() == 1 { if let Some(by) = by.get(0) { let mut builder = - ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size()); if by.is_empty() { ca.for_each(|opt_s| match opt_s { @@ -173,10 +175,11 @@ where } builder.finish() } else { - ListChunked::full_null_with_dtype(ca.name(), ca.len(), &DataType::String) + ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String) } } else { - let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); + let mut builder = + ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size()); binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) { (Some(s), Some(by)) => { diff --git a/crates/polars-ops/src/chunked_array/strings/strip.rs b/crates/polars-ops/src/chunked_array/strings/strip.rs index c7468d238807..cd92704d6bfe 100644 --- a/crates/polars-ops/src/chunked_array/strings/strip.rs +++ 
b/crates/polars-ops/src/chunked_array/strings/strip.rs @@ -124,7 +124,7 @@ pub fn strip_prefix(ca: &StringChunked, prefix: &StringChunked) -> StringChunked Some(prefix) => unary_elementwise(ca, |opt_s| { opt_s.map(|s| s.strip_prefix(prefix).unwrap_or(s)) }), - _ => StringChunked::full_null(ca.name(), ca.len()), + _ => StringChunked::full_null(ca.name().clone(), ca.len()), }, _ => broadcast_binary_elementwise(ca, prefix, strip_prefix_binary), } @@ -136,7 +136,7 @@ pub fn strip_suffix(ca: &StringChunked, suffix: &StringChunked) -> StringChunked Some(suffix) => unary_elementwise(ca, |opt_s| { opt_s.map(|s| s.strip_suffix(suffix).unwrap_or(s)) }), - _ => StringChunked::full_null(ca.name(), ca.len()), + _ => StringChunked::full_null(ca.name().clone(), ca.len()), }, _ => broadcast_binary_elementwise(ca, suffix, strip_suffix_binary), } diff --git a/crates/polars-ops/src/chunked_array/strings/substring.rs b/crates/polars-ops/src/chunked_array/strings/substring.rs index c9512f11bb2c..41fed212d439 100644 --- a/crates/polars-ops/src/chunked_array/strings/substring.rs +++ b/crates/polars-ops/src/chunked_array/strings/substring.rs @@ -163,14 +163,14 @@ pub(super) fn substring( let str_val = ca.get(0); let offset = offset.get(0); unary_elementwise(length, |length| substring_ternary(str_val, offset, length)) - .with_name(ca.name()) + .with_name(ca.name().clone()) }, (_, 1, 1) => { let offset = offset.get(0); let length = length.get(0).unwrap_or(u64::MAX); let Some(offset) = offset else { - return StringChunked::full_null(ca.name(), ca.len()); + return StringChunked::full_null(ca.name().clone(), ca.len()); }; unsafe { @@ -184,7 +184,7 @@ pub(super) fn substring( let str_val = ca.get(0); let length = length.get(0); unary_elementwise(offset, |offset| substring_ternary(str_val, offset, length)) - .with_name(ca.name()) + .with_name(ca.name().clone()) }, (1, len_b, len_c) if len_b == len_c => { let str_val = ca.get(0); @@ -225,7 +225,7 @@ pub(super) fn head(ca: &StringChunked, n: 
&Int64Chunked) -> PolarsResult { let n = n.get(0); let Some(n) = n else { - return Ok(StringChunked::full_null(ca.name(), len)); + return Ok(StringChunked::full_null(ca.name().clone(), len)); }; Ok(unsafe { @@ -238,7 +238,7 @@ pub(super) fn head(ca: &StringChunked, n: &Int64Chunked) -> PolarsResult { let str_val = ca.get(0); - Ok(unary_elementwise(n, |n| head_binary(str_val, n)).with_name(ca.name())) + Ok(unary_elementwise(n, |n| head_binary(str_val, n)).with_name(ca.name().clone())) }, (a, b) => { polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'str.head' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b); @@ -252,7 +252,7 @@ pub(super) fn tail(ca: &StringChunked, n: &Int64Chunked) -> PolarsResult { let n = n.get(0); let Some(n) = n else { - return Ok(StringChunked::full_null(ca.name(), len)); + return Ok(StringChunked::full_null(ca.name().clone(), len)); }; unsafe { ca.apply_views(|view, val| { @@ -264,7 +264,7 @@ pub(super) fn tail(ca: &StringChunked, n: &Int64Chunked) -> PolarsResult { let str_val = ca.get(0); - unary_elementwise(n, |n| tail_binary(str_val, n)).with_name(ca.name()) + unary_elementwise(n, |n| tail_binary(str_val, n)).with_name(ca.name().clone()) }, (a, b) => { polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'str.tail' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b); diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index a5f0b0197e9f..61ccf86257e2 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -15,6 +15,7 @@ pub type ChunkJoinOptIds = Vec; #[cfg(not(feature = "chunked_ids"))] pub type ChunkJoinIds = Vec; +use polars_core::export::once_cell::sync::Lazy; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -23,7 +24,7 @@ use serde::{Deserialize, Serialize}; pub struct JoinArgs { pub how: JoinType, pub validation: 
JoinValidation, - pub suffix: Option, + pub suffix: Option, pub slice: Option<(i64, usize)>, pub join_nulls: bool, pub coalesce: JoinCoalesce, @@ -94,13 +95,14 @@ impl JoinArgs { self } - pub fn with_suffix(mut self, suffix: Option) -> Self { + pub fn with_suffix(mut self, suffix: Option) -> Self { self.suffix = suffix; self } - pub fn suffix(&self) -> &str { - self.suffix.as_deref().unwrap_or("_right") + pub fn suffix(&self) -> &PlSmallStr { + static DEFAULT: Lazy = Lazy::new(|| PlSmallStr::from_static("_right")); + self.suffix.as_ref().unwrap_or(&*DEFAULT) } } diff --git a/crates/polars-ops/src/frame/join/asof/default.rs b/crates/polars-ops/src/frame/join/asof/default.rs index c8c8c68094bf..a0ff5d114426 100644 --- a/crates/polars-ops/src/frame/join/asof/default.rs +++ b/crates/polars-ops/src/frame/join/asof/default.rs @@ -15,7 +15,7 @@ where F: FnMut(T::Physical<'a>, T::Physical<'a>) -> bool, { if left.len() == left.null_count() || right.len() == right.null_count() { - return IdxCa::full_null("", left.len()); + return IdxCa::full_null(PlSmallStr::const_default(), left.len()); } let mut out = vec![0; left.len()]; @@ -55,7 +55,7 @@ where } let bitmap = Bitmap::try_new(mask, out.len()).unwrap(); - IdxCa::from_vec_validity("", out, Some(bitmap)) + IdxCa::from_vec_validity(PlSmallStr::const_default(), out, Some(bitmap)) } fn join_asof_forward<'a, T, F>(left: &'a T::Array, right: &'a T::Array, filter: F) -> IdxCa diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 3068effca15a..c6d221b43b4e 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -16,10 +16,10 @@ use polars_utils::aliases::PlRandomState; use polars_utils::hashing::{hash_to_partition, DirtyHash}; use polars_utils::idx_vec::IdxVec; use polars_utils::nulls::IsNull; +use polars_utils::pl_str::PlSmallStr; use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash}; use 
polars_utils::unitvec; use rayon::prelude::*; -use smartstring::alias::String as SmartString; use super::*; @@ -600,11 +600,11 @@ pub trait AsofJoinBy: IntoDf { other: &DataFrame, left_on: &Series, right_on: &Series, - left_by: Vec, - right_by: Vec, + left_by: Vec, + right_by: Vec, strategy: AsofStrategy, tolerance: Option>, - suffix: Option<&str>, + suffix: Option, slice: Option<(i64, usize)>, coalesce: bool, ) -> PolarsResult { @@ -678,8 +678,12 @@ pub trait AsofJoinBy: IntoDf { let left = self_df.clone(); // SAFETY: join tuples are in bounds. - let right_df = - unsafe { proj_other_df.take_unchecked(&IdxCa::with_chunk("", right_join_tuples)) }; + let right_df = unsafe { + proj_other_df.take_unchecked(&IdxCa::with_chunk( + PlSmallStr::const_default(), + right_join_tuples, + )) + }; _finish_join(left, right_df, suffix) } diff --git a/crates/polars-ops/src/frame/join/asof/mod.rs b/crates/polars-ops/src/frame/join/asof/mod.rs index 07fdd69c7399..71e813cdac39 100644 --- a/crates/polars-ops/src/frame/join/asof/mod.rs +++ b/crates/polars-ops/src/frame/join/asof/mod.rs @@ -5,9 +5,9 @@ use std::borrow::Cow; use default::*; pub use groups::AsofJoinBy; use polars_core::prelude::*; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use smartstring::alias::String as SmartString; #[cfg(feature = "dtype-categorical")] use super::_check_categorical_src; @@ -152,9 +152,9 @@ pub struct AsOfOptions { /// - "5m" /// - "2h15m" /// - "1d6h" - pub tolerance_str: Option, - pub left_by: Option>, - pub right_by: Option>, + pub tolerance_str: Option, + pub left_by: Option>, + pub right_by: Option>, } fn check_asof_columns( @@ -212,7 +212,7 @@ pub trait AsofJoin: IntoDf { right_key: &Series, strategy: AsofStrategy, tolerance: Option>, - suffix: Option, + suffix: Option, slice: Option<(i64, usize)>, coalesce: bool, ) -> PolarsResult { @@ -284,7 +284,7 @@ pub trait AsofJoin: IntoDf { // SAFETY: join tuples are in bounds. 
let right_df = unsafe { other.take_unchecked(&take_idx) }; - _finish_join(left, right_df, suffix.as_deref()) + _finish_join(left, right_df, suffix) } } diff --git a/crates/polars-ops/src/frame/join/cross_join.rs b/crates/polars-ops/src/frame/join/cross_join.rs index 1e1b1bcba497..c4290e262627 100644 --- a/crates/polars-ops/src/frame/join/cross_join.rs +++ b/crates/polars-ops/src/frame/join/cross_join.rs @@ -1,5 +1,5 @@ use polars_core::utils::{concat_df_unchecked, CustomIterTools, NoNull}; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use super::*; @@ -99,7 +99,7 @@ pub trait CrossJoin: IntoDf { fn _cross_join_with_names( &self, other: &DataFrame, - names: &[SmartString], + names: &[PlSmallStr], ) -> PolarsResult { let (mut l_df, r_df) = self.cross_join_dfs(other, None, false)?; @@ -111,7 +111,7 @@ pub trait CrossJoin: IntoDf { .zip(names) .for_each(|(s, name)| { if s.name() != name { - s.rename(name); + s.rename(name.clone()); } }); } @@ -122,7 +122,7 @@ pub trait CrossJoin: IntoDf { fn cross_join( &self, other: &DataFrame, - suffix: Option<&str>, + suffix: Option, slice: Option<(i64, usize)>, ) -> PolarsResult { let (l_df, r_df) = self.cross_join_dfs(other, slice, true)?; diff --git a/crates/polars-ops/src/frame/join/dispatch_left_right.rs b/crates/polars-ops/src/frame/join/dispatch_left_right.rs index d8dd5396b1e2..f5c91de88a74 100644 --- a/crates/polars-ops/src/frame/join/dispatch_left_right.rs +++ b/crates/polars-ops/src/frame/join/dispatch_left_right.rs @@ -8,12 +8,12 @@ pub(super) fn left_join_from_series( s_right: &Series, args: JoinArgs, verbose: bool, - drop_names: Option<&[&str]>, + drop_names: Option>, ) -> PolarsResult { let (df_left, df_right) = materialize_left_join_from_series( left, right, s_left, s_right, &args, verbose, drop_names, )?; - _finish_join(df_left, df_right, args.suffix.as_deref()) + _finish_join(df_left, df_right, args.suffix) } pub(super) fn right_join_from_series( @@ -23,13 +23,13 @@ 
pub(super) fn right_join_from_series( s_right: &Series, args: JoinArgs, verbose: bool, - drop_names: Option<&[&str]>, + drop_names: Option>, ) -> PolarsResult { // Swap the order of tables to do a right join. let (df_right, df_left) = materialize_left_join_from_series( right, left, s_right, s_left, &args, verbose, drop_names, )?; - _finish_join(df_left, df_right, args.suffix.as_deref()) + _finish_join(df_left, df_right, args.suffix) } pub fn materialize_left_join_from_series( @@ -39,7 +39,7 @@ pub fn materialize_left_join_from_series( s_right: &Series, args: &JoinArgs, verbose: bool, - drop_names: Option<&[&str]>, + drop_names: Option>, ) -> PolarsResult<(DataFrame, DataFrame)> { #[cfg(feature = "dtype-categorical")] _check_categorical_src(s_left.dtype(), s_right.dtype())?; diff --git a/crates/polars-ops/src/frame/join/general.rs b/crates/polars-ops/src/frame/join/general.rs index 2e4d38e2af0d..5840b853425c 100644 --- a/crates/polars-ops/src/frame/join/general.rs +++ b/crates/polars-ops/src/frame/join/general.rs @@ -1,12 +1,14 @@ +use polars_utils::format_pl_smallstr; + use super::*; use crate::series::coalesce_series; -pub fn _join_suffix_name(name: &str, suffix: &str) -> String { - format!("{name}{suffix}") +pub fn _join_suffix_name(name: &str, suffix: &str) -> PlSmallStr { + format_pl_smallstr!("{name}{suffix}") } -fn get_suffix(suffix: Option<&str>) -> &str { - suffix.unwrap_or("_right") +fn get_suffix(suffix: Option) -> PlSmallStr { + suffix.unwrap_or_else(|| PlSmallStr::from_static("_right")) } /// Utility method to finish a join. 
@@ -14,7 +16,7 @@ fn get_suffix(suffix: Option<&str>) -> &str { pub fn _finish_join( mut df_left: DataFrame, mut df_right: DataFrame, - suffix: Option<&str>, + suffix: Option, ) -> PolarsResult { let mut left_names = PlHashSet::with_capacity(df_left.width()); @@ -32,8 +34,8 @@ pub fn _finish_join( let suffix = get_suffix(suffix); for name in rename_strs { - let new_name = _join_suffix_name(&name, suffix); - df_right.rename(&name, new_name.as_str()).map_err(|_| { + let new_name = _join_suffix_name(name.as_str(), suffix.as_str()); + df_right.rename(&name, new_name.clone()).map_err(|_| { polars_err!(Duplicate: "column with name '{}' already exists\n\n\ You may want to try:\n\ - renaming the column prior to joining\n\ @@ -48,9 +50,9 @@ pub fn _finish_join( pub fn _coalesce_full_join( mut df: DataFrame, - keys_left: &[&str], - keys_right: &[&str], - suffix: Option<&str>, + keys_left: &[PlSmallStr], + keys_right: &[PlSmallStr], + suffix: Option, df_left: &DataFrame, ) -> DataFrame { // No need to allocate the schema because we already @@ -67,14 +69,14 @@ pub fn _coalesce_full_join( // SAFETY: we maintain invariants. 
let columns = unsafe { df.get_columns_mut() }; - for (&l, &r) in keys_left.iter().zip(keys_right.iter()) { - let pos_l = schema.get_full(l).unwrap().0; + let suffix = get_suffix(suffix); + for (l, r) in keys_left.iter().zip(keys_right.iter()) { + let pos_l = schema.get_full(l.as_str()).unwrap().0; - let r = if l == r || schema_left.contains(r) { - let suffix = get_suffix(suffix); - Cow::Owned(_join_suffix_name(r, suffix)) + let r = if l == r || schema_left.contains(r.as_str()) { + _join_suffix_name(r.as_str(), suffix.as_str()) } else { - Cow::Borrowed(r) + r.clone() }; let pos_r = schema.get_full(&r).unwrap().0; diff --git a/crates/polars-ops/src/frame/join/hash_join/mod.rs b/crates/polars-ops/src/frame/join/hash_join/mod.rs index 65e6d0a56dce..f31fca3d5ea4 100644 --- a/crates/polars-ops/src/frame/join/hash_join/mod.rs +++ b/crates/polars-ops/src/frame/join/hash_join/mod.rs @@ -147,8 +147,8 @@ pub trait JoinDispatch: IntoDf { join_idx_l.slice(offset, len); join_idx_r.slice(offset, len); } - let idx_ca_l = IdxCa::with_chunk("", join_idx_l); - let idx_ca_r = IdxCa::with_chunk("", join_idx_r); + let idx_ca_l = IdxCa::with_chunk(PlSmallStr::const_default(), join_idx_l); + let idx_ca_r = IdxCa::with_chunk(PlSmallStr::const_default(), join_idx_r); // Take the left and right dataframes by join tuples let (df_left, df_right) = POOL.join( @@ -157,13 +157,13 @@ pub trait JoinDispatch: IntoDf { ); let coalesce = args.coalesce.coalesce(&JoinType::Full); - let out = _finish_join(df_left, df_right, args.suffix.as_deref()); + let out = _finish_join(df_left, df_right, args.suffix.clone()); if coalesce { Ok(_coalesce_full_join( out?, - &[s_left.name()], - &[s_right.name()], - args.suffix.as_deref(), + &[s_left.name().clone()], + &[s_right.name().clone()], + args.suffix.clone(), df_self, )) } else { diff --git a/crates/polars-ops/src/frame/join/merge_sorted.rs b/crates/polars-ops/src/frame/join/merge_sorted.rs index d368ef5f5159..5e307504563f 100644 --- 
a/crates/polars-ops/src/frame/join/merge_sorted.rs +++ b/crates/polars-ops/src/frame/join/merge_sorted.rs @@ -38,7 +38,7 @@ pub fn _merge_sorted_dfs( let out = merge_series(&lhs_phys, &rhs_phys, &merge_indicator)?; let mut out = out.cast(lhs.dtype()).unwrap(); - out.rename(lhs.name()); + out.rename(lhs.name().clone()); Ok(out) }) .collect::>()?; @@ -81,7 +81,7 @@ fn merge_series(lhs: &Series, rhs: &Series, merge_indicator: &[bool]) -> PolarsR .zip(rhs.fields_as_series()) .map(|(lhs, rhs)| merge_series(lhs, &rhs, merge_indicator)) .collect::>>()?; - StructChunked::from_series("", &new_fields) + StructChunked::from_series(PlSmallStr::const_default(), &new_fields) .unwrap() .into_series() }, diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index afa5166147e4..fca31d119f5b 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -90,7 +90,7 @@ pub trait DataFrameJoinOps: IntoDf { ) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { let df_left = self.to_df(); let selected_left = df_left.select_series(left_on)?; @@ -114,7 +114,7 @@ pub trait DataFrameJoinOps: IntoDf { #[cfg(feature = "cross_join")] if let JoinType::Cross = args.how { - return left_df.cross_join(other, args.suffix.as_deref(), args.slice); + return left_df.cross_join(other, args.suffix.clone(), args.slice); } // Clear literals if a frame is empty. 
Otherwise we could get an oob @@ -195,8 +195,8 @@ pub trait DataFrameJoinOps: IntoDf { Err(_) => { let (ca_left, ca_right) = make_categoricals_compatible(l.categorical()?, r.categorical()?)?; - *l = ca_left.into_series().with_name(l.name()); - *r = ca_right.into_series().with_name(r.name()); + *l = ca_left.into_series().with_name(l.name().clone()); + *r = ca_right.into_series().with_name(r.name().clone()); }, } } @@ -205,7 +205,8 @@ pub trait DataFrameJoinOps: IntoDf { if selected_left.len() == 1 { let s_left = &selected_left[0]; let s_right = &selected_right[0]; - let drop_names: Option<&[&str]> = if should_coalesce { None } else { Some(&[]) }; + let drop_names: Option> = + if should_coalesce { None } else { Some(vec![]) }; return match args.how { JoinType::Inner => left_df ._inner_join_from_series(other, s_left, s_right, args, _verbose, drop_names), @@ -254,7 +255,7 @@ pub trait DataFrameJoinOps: IntoDf { right_by, options.strategy, options.tolerance, - args.suffix.as_deref(), + args.suffix.clone(), args.slice, should_coalesce, ), @@ -282,9 +283,12 @@ pub trait DataFrameJoinOps: IntoDf { let rhs_keys = prepare_keys_multiple(&selected_right, args.join_nulls)?.into_series(); let drop_names = if should_coalesce { - Some(selected_right.iter().map(|s| s.name()).collect::>()) + selected_right + .iter() + .map(|s| s.name().clone()) + .collect::>() } else { - Some(vec![]) + vec![] }; // Multiple keys. 
@@ -297,7 +301,10 @@ pub trait DataFrameJoinOps: IntoDf { unreachable!() }, JoinType::Full => { - let names_left = selected_left.iter().map(|s| s.name()).collect::>(); + let names_left = selected_left + .iter() + .map(|s| s.name().clone()) + .collect::>(); args.coalesce = JoinCoalesce::KeepColumns; let suffix = args.suffix.clone(); let out = left_df._full_join_from_series(other, &lhs_keys, &rhs_keys, args); @@ -305,9 +312,9 @@ pub trait DataFrameJoinOps: IntoDf { if should_coalesce { Ok(_coalesce_full_join( out?, - &names_left, - drop_names.as_ref().unwrap(), - suffix.as_deref(), + names_left.as_slice(), + drop_names.as_slice(), + suffix.clone(), left_df, )) } else { @@ -320,7 +327,7 @@ pub trait DataFrameJoinOps: IntoDf { &rhs_keys, args, _verbose, - drop_names.as_deref(), + Some(drop_names), ), JoinType::Left => dispatch_left_right::left_join_from_series( left_df.clone(), @@ -329,7 +336,7 @@ pub trait DataFrameJoinOps: IntoDf { &rhs_keys, args, _verbose, - drop_names.as_deref(), + Some(drop_names), ), JoinType::Right => dispatch_left_right::right_join_from_series( left_df, @@ -338,7 +345,7 @@ pub trait DataFrameJoinOps: IntoDf { &rhs_keys, args, _verbose, - drop_names.as_deref(), + Some(drop_names), ), #[cfg(feature = "semi_anti_join")] JoinType::Anti | JoinType::Semi => self._join_impl( @@ -371,7 +378,7 @@ pub trait DataFrameJoinOps: IntoDf { ) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { self.join(other, left_on, right_on, JoinArgs::new(JoinType::Inner)) } @@ -414,7 +421,7 @@ pub trait DataFrameJoinOps: IntoDf { fn left_join(&self, other: &DataFrame, left_on: I, right_on: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { self.join(other, left_on, right_on, JoinArgs::new(JoinType::Left)) } @@ -432,7 +439,7 @@ pub trait DataFrameJoinOps: IntoDf { fn full_join(&self, other: &DataFrame, left_on: I, right_on: I) -> PolarsResult where I: IntoIterator, - S: AsRef, + S: Into, { self.join(other, left_on, right_on, 
JoinArgs::new(JoinType::Full)) } @@ -446,7 +453,7 @@ trait DataFrameJoinOpsPrivate: IntoDf { s_right: &Series, args: JoinArgs, verbose: bool, - drop_names: Option<&[&str]>, + drop_names: Option>, ) -> PolarsResult { let left_df = self.to_df(); #[cfg(feature = "dtype-categorical")] @@ -474,7 +481,7 @@ trait DataFrameJoinOpsPrivate: IntoDf { ._take_unchecked_slice(join_tuples_right, true) }, ); - _finish_join(df_left, df_right, args.suffix.as_deref()) + _finish_join(df_left, df_right, args.suffix.clone()) } } diff --git a/crates/polars-ops/src/frame/mod.rs b/crates/polars-ops/src/frame/mod.rs index 93b2af3dd272..5691919c8861 100644 --- a/crates/polars-ops/src/frame/mod.rs +++ b/crates/polars-ops/src/frame/mod.rs @@ -96,13 +96,16 @@ pub trait DataFrameOps: IntoDf { ) -> PolarsResult { let df = self.to_df(); - let set: PlHashSet<&str> = - PlHashSet::from_iter(columns.unwrap_or_else(|| df.get_column_names())); + let set: PlHashSet<&str> = if let Some(columns) = columns { + PlHashSet::from_iter(columns) + } else { + PlHashSet::from_iter(df.iter().map(|s| s.name().as_str())) + }; let cols = POOL.install(|| { df.get_columns() .par_iter() - .map(|s| match set.contains(s.name()) { + .map(|s| match set.contains(s.name().as_str()) { true => s.to_dummies(separator, drop_first), false => Ok(s.clone().into_frame()), }) diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 8fedd2f1860b..d909b580f87b 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -8,6 +8,7 @@ use polars_core::frame::group_by::expr::PhysicalAggExpr; use polars_core::prelude::*; use polars_core::utils::_split_offsets; use polars_core::{downcast_as_macro_arg_physical, POOL}; +use polars_utils::format_pl_smallstr; pub use unpivot::UnpivotDF; const HASHMAP_INIT_SIZE: usize = 512; @@ -97,14 +98,11 @@ where I0: IntoIterator, I1: IntoIterator, I2: IntoIterator, - S0: AsRef, - S1: AsRef, - S2: AsRef, + S0: Into, + S1: 
Into, + S2: Into, { - let on = on - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); + let on = on.into_iter().map(Into::into).collect::>(); let (index, values) = assign_remaining_columns(pivot_df, &on, index, values)?; pivot_impl( pivot_df, @@ -136,20 +134,17 @@ where I0: IntoIterator, I1: IntoIterator, I2: IntoIterator, - S0: AsRef, - S1: AsRef, - S2: AsRef, + S0: Into, + S1: Into, + S2: Into, { - let on = on - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); + let on = on.into_iter().map(Into::into).collect::>(); let (index, values) = assign_remaining_columns(pivot_df, &on, index, values)?; pivot_impl( pivot_df, - &on, - &index, - &values, + on.as_slice(), + index.as_slice(), + values.as_slice(), agg_fn, sort_columns, true, @@ -164,39 +159,39 @@ where /// - At least one of `index` and `values` must be non-null. fn assign_remaining_columns( df: &DataFrame, - on: &[String], + on: &[PlSmallStr], index: Option, values: Option, -) -> PolarsResult<(Vec, Vec)> +) -> PolarsResult<(Vec, Vec)> where I1: IntoIterator, I2: IntoIterator, - S1: AsRef, - S2: AsRef, + S1: Into, + S2: Into, { match (index, values) { (Some(index), Some(values)) => { - let index = index.into_iter().map(|s| s.as_ref().to_string()).collect(); - let values = values.into_iter().map(|s| s.as_ref().to_string()).collect(); + let index = index.into_iter().map(Into::into).collect(); + let values = values.into_iter().map(Into::into).collect(); Ok((index, values)) }, (Some(index), None) => { - let index: Vec = index.into_iter().map(|s| s.as_ref().to_string()).collect(); + let index: Vec = index.into_iter().map(Into::into).collect(); let values = df .get_column_names() .into_iter() - .map(|s| s.to_string()) .filter(|c| !(index.contains(c) | on.contains(c))) + .cloned() .collect(); Ok((index, values)) }, (None, Some(values)) => { - let values: Vec = values.into_iter().map(|s| s.as_ref().to_string()).collect(); + let values: Vec = values.into_iter().map(Into::into).collect(); let 
index = df .get_column_names() .into_iter() - .map(|s| s.to_string()) .filter(|c| !(values.contains(c) | on.contains(c))) + .cloned() .collect(); Ok((index, values)) }, @@ -210,12 +205,12 @@ where fn pivot_impl( pivot_df: &DataFrame, // keys of the first group_by operation - on: &[String], + on: &[PlSmallStr], // these columns will be aggregated in the nested group_by - index: &[String], + index: &[PlSmallStr], // these columns will be used for a nested group_by // the rows of this nested group_by will be pivoted as header column values - values: &[String], + values: &[PlSmallStr], // aggregation function agg_fn: Option, sort_columns: bool, @@ -230,14 +225,14 @@ fn pivot_impl( }; if on.len() > 1 { let schema = Arc::new(pivot_df.schema()); - let binding = pivot_df.select_with_schema(on, &schema)?; + let binding = pivot_df.select_with_schema(on.iter().cloned(), &schema)?; let fields = binding.get_columns(); - let column = format!("{{\"{}\"}}", on.join("\",\"")); + let column = format_pl_smallstr!("{{\"{}\"}}", on.join("\",\"")); if schema.contains(column.as_str()) { polars_bail!(ComputeError: "cannot use column name {column} that \ already exists in the DataFrame. 
Please rename it prior to calling `pivot`.") } - let columns_struct = StructChunked::from_series(&column, fields) + let columns_struct = StructChunked::from_series(column.clone(), fields) .unwrap() .into_series(); let mut binding = pivot_df.clone(); @@ -266,9 +261,9 @@ fn pivot_impl( fn pivot_impl_single_column( pivot_df: &DataFrame, - index: &[String], - column: &str, - values: &[String], + index: &[PlSmallStr], + column: &PlSmallStr, + values: &[PlSmallStr], agg_fn: Option, sort_columns: bool, separator: Option<&str>, @@ -278,7 +273,7 @@ fn pivot_impl_single_column( let mut count = 0; let out: PolarsResult<()> = POOL.install(|| { let mut group_by = index.to_vec(); - group_by.push(column.to_string()); + group_by.push(column.clone()); let groups = pivot_df.group_by_stable(group_by)?.take_groups(); @@ -296,9 +291,13 @@ fn pivot_impl_single_column( let value_agg = unsafe { match &agg_fn { None => match value_col.len() > groups.len() { - true => polars_bail!(ComputeError: "found multiple elements in the same group, please specify an aggregation function"), + true => polars_bail!( + ComputeError: + "found multiple elements in the same group, \ + please specify an aggregation function" + ), false => value_col.agg_first(&groups), - } + }, Some(agg_fn) => match agg_fn { Sum => value_col.agg_sum(&groups), Min => value_col.agg_min(&groups), @@ -309,14 +308,14 @@ fn pivot_impl_single_column( Median => value_col.agg_median(&groups), Count => groups.group_count().into_series(), Expr(ref expr) => { - let name = expr.root_name()?; + let name = expr.root_name()?.clone(); let mut value_col = value_col.clone(); value_col.rename(name); let tmp_df = value_col.into_frame(); let mut aggregated = expr.evaluate(&tmp_df, &groups)?; - aggregated.rename(value_col_name); + aggregated.rename(value_col_name.clone()); aggregated - } + }, }, } }; diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index ec6f6eec4792..51761df873b5 
100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -73,7 +73,9 @@ pub(super) fn position_aggregates( .map(|(i, opt_name)| { let offset = i * n_rows; let avs = &buf[offset..offset + n_rows]; - let name = opt_name.unwrap_or("null"); + let name = opt_name + .map(PlSmallStr::from_str) + .unwrap_or_else(|| PlSmallStr::from_static("null")); let out = match &phys_type { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { @@ -166,7 +168,9 @@ where .map(|(i, opt_name)| { let offset = i * n_rows; let opt_values = &buf[offset..offset + n_rows]; - let name = opt_name.unwrap_or("null"); + let name = opt_name + .map(PlSmallStr::from_str) + .unwrap_or_else(|| PlSmallStr::from_static("null")); let out = ChunkedArray::::from_slice_options(name, opt_values).into_series(); unsafe { out.cast_unchecked(logical_type).unwrap() } }) @@ -293,7 +297,7 @@ pub(super) fn compute_col_idx( } fn compute_row_index<'a, T>( - index: &[String], + index: &[PlSmallStr], index_agg_physical: &'a ChunkedArray, count: usize, logical_type: &DataType, @@ -331,7 +335,7 @@ where .map(|(k, _)| Option::>::peel_total_ord(k)) .collect::>() .into_series(); - s.rename(&index[0]); + s.rename(index[0].clone()); let s = restore_logical_type(&s, logical_type); Some(vec![s]) }, @@ -342,7 +346,7 @@ where } fn compute_row_index_struct( - index: &[String], + index: &[PlSmallStr], index_agg: &Series, index_agg_physical: &BinaryOffsetChunked, count: usize, @@ -377,7 +381,7 @@ fn compute_row_index_struct( // SAFETY: `unique_indices` is filled with elements between // 0 and `index_agg.len() - 1`. let mut s = unsafe { index_agg.take_slice_unchecked(&unique_indices) }; - s.rename(&index[0]); + s.rename(index[0].clone()); Some(vec![s]) }, _ => None, @@ -389,7 +393,7 @@ fn compute_row_index_struct( // TODO! Also create a specialized version for numerics. 
pub(super) fn compute_row_idx( pivot_df: &DataFrame, - index: &[String], + index: &[PlSmallStr], groups: &GroupsProxy, count: usize, ) -> PolarsResult<(Vec, usize, Option>)> { @@ -452,7 +456,7 @@ pub(super) fn compute_row_idx( let row_index = match count { 0 => { let s = Series::new( - &index[0], + index[0].clone(), row_to_idx.into_iter().map(|(k, _)| k).collect::>(), ); let s = restore_logical_type(&s, index_s.dtype()); @@ -465,9 +469,11 @@ pub(super) fn compute_row_idx( }, } } else { - let binding = pivot_df.select(index)?; + let binding = pivot_df.select(index.iter().cloned())?; let fields = binding.get_columns(); - let index_struct_series = StructChunked::from_series("placeholder", fields)?.into_series(); + let index_struct_series = + StructChunked::from_series(PlSmallStr::from_static("placeholder"), fields)? + .into_series(); let index_agg = unsafe { index_struct_series.agg_first(groups) }; let index_agg_physical = index_agg.to_physical_repr(); let ca = index_agg_physical.struct_()?; diff --git a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index 289529d4b4f4..a9255bdede0e 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -1,6 +1,6 @@ use arrow::array::{MutableArray, MutablePlString}; use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; -use polars_core::datatypes::{DataType, SmartString}; +use polars_core::datatypes::{DataType, PlSmallStr}; use polars_core::frame::DataFrame; use polars_core::prelude::{IntoVec, Series, UnpivotArgsIR}; use polars_core::utils::try_get_supertype; @@ -68,8 +68,8 @@ pub trait UnpivotDF: IntoDf { /// ``` fn unpivot(&self, on: I, index: J) -> PolarsResult where - I: IntoVec, - J: IntoVec, + I: IntoVec, + J: IntoVec, { let index = index.into_vec(); let on = on.into_vec(); @@ -87,8 +87,12 @@ pub trait UnpivotDF: IntoDf { let index = args.index; let mut on = args.on; - let variable_name = 
args.variable_name.as_deref().unwrap_or("variable"); - let value_name = args.value_name.as_deref().unwrap_or("value"); + let variable_name = args + .variable_name + .unwrap_or_else(|| PlSmallStr::from_static("variable")); + let value_name = args + .value_name + .unwrap_or_else(|| PlSmallStr::from_static("value")); if self_.get_columns().is_empty() { return DataFrame::new(vec![ @@ -113,7 +117,7 @@ pub trait UnpivotDF: IntoDf { return Ok(unsafe { DataFrame::new_no_checks(out) }); } - let index_set = PlHashSet::from_iter(index.iter().map(|s| s.as_str())); + let index_set = PlHashSet::from_iter(index.iter().cloned()); on = self_ .get_columns() .iter() @@ -121,7 +125,7 @@ pub trait UnpivotDF: IntoDf { if index_set.contains(s.name()) { None } else { - Some(s.name().into()) + Some(s.name().clone()) } }) .collect(); diff --git a/crates/polars-ops/src/series/ops/approx_unique.rs b/crates/polars-ops/src/series/ops/approx_unique.rs index 31093e06b77a..ab0ea5db8966 100644 --- a/crates/polars-ops/src/series/ops/approx_unique.rs +++ b/crates/polars-ops/src/series/ops/approx_unique.rs @@ -17,7 +17,7 @@ where ca.iter().for_each(|item| hllp.add(&item.to_total_ord())); let c = hllp.count() as IdxSize; - Ok(Series::new(ca.name(), &[c])) + Ok(Series::new(ca.name().clone(), &[c])) } fn dispatcher(s: &Series) -> PolarsResult { @@ -59,7 +59,7 @@ fn dispatcher(s: &Series) -> PolarsResult { /// /// use polars_core::prelude::*; /// -/// let s = Series::new("s", [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]); +/// let s = Series::new("s".into(), [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]); /// /// let approx_count = approx_n_unique(&s).unwrap(); /// println!("{}", approx_count); diff --git a/crates/polars-ops/src/series/ops/business.rs b/crates/polars-ops/src/series/ops/business.rs index 17090e9a0b19..eff3e2e8c0ba 100644 --- a/crates/polars-ops/src/series/ops/business.rs +++ b/crates/polars-ops/src/series/ops/business.rs @@ -55,7 +55,7 @@ pub fn business_day_count( ) }) } else { - 
Int32Chunked::full_null(start_dates.name(), start_dates.len()) + Int32Chunked::full_null(start_dates.name().clone(), start_dates.len()) } }, (1, _) => { @@ -70,7 +70,7 @@ pub fn business_day_count( ) }) } else { - Int32Chunked::full_null(start_dates.name(), end_dates.len()) + Int32Chunked::full_null(start_dates.name().clone(), end_dates.len()) } }, _ => binary_elementwise_values(start_dates, end_dates, |start_date, end_date| { @@ -223,7 +223,7 @@ pub fn add_business_days( )) })? } else { - Int32Chunked::full_null(start_dates.name(), start_dates.len()) + Int32Chunked::full_null(start_dates.name().clone(), start_dates.len()) } }, (1, _) => { @@ -241,7 +241,7 @@ pub fn add_business_days( ) }) } else { - Int32Chunked::full_null(start_dates.name(), n.len()) + Int32Chunked::full_null(start_dates.name().clone(), n.len()) } }, _ => try_binary_elementwise(start_dates, n, |opt_start_date, opt_n| { diff --git a/crates/polars-ops/src/series/ops/cum_agg.rs b/crates/polars-ops/src/series/ops/cum_agg.rs index bd498f5088e6..dab529796ddb 100644 --- a/crates/polars-ops/src/series/ops/cum_agg.rs +++ b/crates/polars-ops/src/series/ops/cum_agg.rs @@ -74,7 +74,7 @@ where false => ca.iter().scan(init, det_max).collect_trusted(), true => ca.iter().rev().scan(init, det_max).collect_reversed(), }; - out.with_name(ca.name()) + out.with_name(ca.name().clone()) } fn cum_min_numeric(ca: &ChunkedArray, reverse: bool) -> ChunkedArray @@ -87,7 +87,7 @@ where false => ca.iter().scan(init, det_min).collect_trusted(), true => ca.iter().rev().scan(init, det_min).collect_reversed(), }; - out.with_name(ca.name()) + out.with_name(ca.name().clone()) } fn cum_sum_numeric(ca: &ChunkedArray, reverse: bool) -> ChunkedArray @@ -100,7 +100,7 @@ where false => ca.iter().scan(init, det_sum).collect_trusted(), true => ca.iter().rev().scan(init, det_sum).collect_reversed(), }; - out.with_name(ca.name()) + out.with_name(ca.name().clone()) } fn cum_prod_numeric(ca: &ChunkedArray, reverse: bool) -> ChunkedArray @@ 
-113,7 +113,7 @@ where false => ca.iter().scan(init, det_prod).collect_trusted(), true => ca.iter().rev().scan(init, det_prod).collect_reversed(), }; - out.with_name(ca.name()) + out.with_name(ca.name().clone()) } /// Get an array with the cumulative product computed at every element. @@ -211,7 +211,7 @@ pub fn cum_max(s: &Series, reverse: bool) -> PolarsResult { pub fn cum_count(s: &Series, reverse: bool) -> PolarsResult { let mut out = if s.null_count() == 0 { // Fast paths for no nulls - cum_count_no_nulls(s.name(), s.len(), reverse) + cum_count_no_nulls(s.name().clone(), s.len(), reverse) } else { let ca = s.is_not_null(); let out: IdxCa = if reverse { @@ -242,7 +242,7 @@ pub fn cum_count(s: &Series, reverse: bool) -> PolarsResult { Ok(out) } -fn cum_count_no_nulls(name: &str, len: usize, reverse: bool) -> Series { +fn cum_count_no_nulls(name: PlSmallStr, len: usize, reverse: bool) -> Series { let start = 1 as IdxSize; let end = len as IdxSize + 1; let ca: NoNull = if reverse { diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index a999fac2a3a0..2deb6dfeb52f 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -1,16 +1,17 @@ use polars_core::prelude::*; +use polars_utils::format_pl_smallstr; fn map_cats( s: &Series, - labels: &[String], + labels: &[PlSmallStr], sorted_breaks: &[f64], left_closed: bool, include_breaks: bool, ) -> PolarsResult { - let out_name = "category"; + let out_name = PlSmallStr::from_static("category"); // Create new categorical and pre-register labels for consistent categorical indexes. - let mut bld = CategoricalChunkedBuilder::new(out_name, s.len(), Default::default()); + let mut bld = CategoricalChunkedBuilder::new(out_name.clone(), s.len(), Default::default()); for label in labels { bld.register_value(label); } @@ -33,7 +34,10 @@ fn map_cats( // returned a dataframe. That included a column of the right endpoint of the interval. 
So we // return a struct series instead which can be turned into a dataframe later. let right_ends = [sorted_breaks, &[f64::INFINITY]].concat(); - let mut brk_vals = PrimitiveChunkedBuilder::::new("breakpoint", s.len()); + let mut brk_vals = PrimitiveChunkedBuilder::::new( + PlSmallStr::from_static("breakpoint"), + s.len(), + ); s_iter .map(|opt| { opt.filter(|x| !x.is_nan()).map(|x| { @@ -74,7 +78,7 @@ fn map_cats( } } -pub fn compute_labels(breaks: &[f64], left_closed: bool) -> PolarsResult> { +pub fn compute_labels(breaks: &[f64], left_closed: bool) -> PolarsResult> { let lo = std::iter::once(&f64::NEG_INFINITY).chain(breaks.iter()); let hi = breaks.iter().chain(std::iter::once(&f64::INFINITY)); @@ -82,9 +86,9 @@ pub fn compute_labels(breaks: &[f64], left_closed: bool) -> PolarsResult PolarsResult, - labels: Option>, + labels: Option>, left_closed: bool, include_breaks: bool, ) -> PolarsResult { @@ -120,7 +124,7 @@ pub fn cut( pub fn qcut( s: &Series, probs: Vec, - labels: Option>, + labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, @@ -169,9 +173,9 @@ mod test { use super::map_cats; - let s = Series::new("x", &[1, 2, 3, 4, 5]); + let s = Series::new("x".into(), &[1, 2, 3, 4, 5]); - let labels = &["a", "b", "c"].map(str::to_owned); + let labels = &["a", "b", "c"].map(PlSmallStr::from_static); let breaks = &[2.0, 4.0]; let left_closed = false; diff --git a/crates/polars-ops/src/series/ops/duration.rs b/crates/polars-ops/src/series/ops/duration.rs index b13a7ea0894f..1d5868260e64 100644 --- a/crates/polars-ops/src/series/ops/duration.rs +++ b/crates/polars-ops/src/series/ops/duration.rs @@ -6,7 +6,7 @@ use polars_error::PolarsResult; pub fn impl_duration(s: &[Series], time_unit: TimeUnit) -> PolarsResult { if s.iter().any(|s| s.is_empty()) { return Ok(Series::new_empty( - s[0].name(), + s[0].name().clone(), &DataType::Duration(time_unit), )); } diff --git a/crates/polars-ops/src/series/ops/ewm.rs 
b/crates/polars-ops/src/series/ops/ewm.rs index 22b99a04a892..d6fa9c31a044 100644 --- a/crates/polars-ops/src/series/ops/ewm.rs +++ b/crates/polars-ops/src/series/ops/ewm.rs @@ -21,7 +21,7 @@ pub fn ewm_mean(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) as ArrayRef)) }, DataType::Float64 => { let xs = s.f64().unwrap(); @@ -32,7 +32,7 @@ pub fn ewm_mean(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) as ArrayRef)) }, _ => ewm_mean(&s.cast(&DataType::Float64)?, options), } @@ -51,7 +51,7 @@ pub fn ewm_std(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) as ArrayRef)) }, DataType::Float64 => { let xs = s.f64().unwrap(); @@ -63,7 +63,7 @@ pub fn ewm_std(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) as ArrayRef)) }, _ => ewm_std(&s.cast(&DataType::Float64)?, options), } @@ -82,7 +82,7 @@ pub fn ewm_var(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) as ArrayRef)) }, DataType::Float64 => { let xs = s.f64().unwrap(); @@ -94,7 +94,7 @@ pub fn ewm_var(s: &Series, options: EWMOptions) -> PolarsResult { options.min_periods, options.ignore_nulls, ); - Series::try_from((s.name(), Box::new(result) as ArrayRef)) + Series::try_from((s.name().clone(), Box::new(result) 
as ArrayRef)) }, _ => ewm_var(&s.cast(&DataType::Float64)?, options), } diff --git a/crates/polars-ops/src/series/ops/ewm_by.rs b/crates/polars-ops/src/series/ops/ewm_by.rs index 9ae0db056ae5..fe79710ab9bf 100644 --- a/crates/polars-ops/src/series/ops/ewm_by.rs +++ b/crates/polars-ops/src/series/ops/ewm_by.rs @@ -135,7 +135,7 @@ where let validity = binary_concatenate_validities(times, values); arr = arr.with_validity_typed(validity); } - ChunkedArray::with_chunk(values.name(), arr) + ChunkedArray::with_chunk(values.name().clone(), arr) } /// Fastpath if `times` is known to already be sorted. @@ -184,7 +184,7 @@ where let validity = binary_concatenate_validities(times, values); arr = arr.with_validity_typed(validity); } - ChunkedArray::with_chunk(values.name(), arr) + ChunkedArray::with_chunk(values.name().clone(), arr) } fn adjust_half_life_to_time_unit(half_life: i64, time_unit: &TimeUnit) -> i64 { diff --git a/crates/polars-ops/src/series/ops/fused.rs b/crates/polars-ops/src/series/ops/fused.rs index 86c8b5656fe0..16b06f76c479 100644 --- a/crates/polars-ops/src/series/ops/fused.rs +++ b/crates/polars-ops/src/series/ops/fused.rs @@ -38,7 +38,7 @@ fn fma_ca( .zip(b.downcast_iter()) .zip(c.downcast_iter()) .map(|((a, b), c)| fma_arr(a, b, c)); - ChunkedArray::from_chunk_iter(a.name(), chunks) + ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } pub fn fma_series(a: &Series, b: &Series, c: &Series) -> Series { @@ -89,7 +89,7 @@ fn fsm_ca( .zip(b.downcast_iter()) .zip(c.downcast_iter()) .map(|((a, b), c)| fsm_arr(a, b, c)); - ChunkedArray::from_chunk_iter(a.name(), chunks) + ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } pub fn fsm_series(a: &Series, b: &Series, c: &Series) -> Series { @@ -139,7 +139,7 @@ fn fms_ca( .zip(b.downcast_iter()) .zip(c.downcast_iter()) .map(|((a, b), c)| fms_arr(a, b, c)); - ChunkedArray::from_chunk_iter(a.name(), chunks) + ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } pub fn fms_series(a: &Series, b: &Series, 
c: &Series) -> Series { diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs index c8e3488aab93..4412e2aa21d1 100644 --- a/crates/polars-ops/src/series/ops/horizontal.rs +++ b/crates/polars-ops/src/series/ops/horizontal.rs @@ -4,25 +4,25 @@ use polars_core::prelude::*; pub fn max_horizontal(s: &[Series]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.max_horizontal() - .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name()))) + .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn min_horizontal(s: &[Series]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.min_horizontal() - .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name()))) + .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn sum_horizontal(s: &[Series]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.sum_horizontal(NullStrategy::Ignore) - .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name()))) + .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn mean_horizontal(s: &[Series]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.mean_horizontal(NullStrategy::Ignore) - .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name()))) + .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn coalesce_series(s: &[Series]) -> PolarsResult { diff --git a/crates/polars-ops/src/series/ops/int_range.rs b/crates/polars-ops/src/series/ops/int_range.rs index 4c68b2280635..5e5a3d419acb 100644 --- a/crates/polars-ops/src/series/ops/int_range.rs +++ b/crates/polars-ops/src/series/ops/int_range.rs @@ -5,7 +5,7 @@ pub fn new_int_range( start: T::Native, end: T::Native, step: i64, - name: &str, + name: PlSmallStr, ) -> PolarsResult where T: PolarsIntegerType, diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs 
b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs index 11af19651fe0..36d9dc12e556 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs @@ -103,9 +103,9 @@ where out.into(), Some(validity.into()), ); - ChunkedArray::with_chunk(chunked_arr.name(), array) + ChunkedArray::with_chunk(chunked_arr.name().clone(), array) } else { - ChunkedArray::from_vec(chunked_arr.name(), out) + ChunkedArray::from_vec(chunked_arr.name().clone(), out) } } @@ -211,7 +211,7 @@ mod test { #[test] fn test_interpolate() { - let ca = UInt32Chunked::new("", &[Some(1), None, None, Some(4), Some(5)]); + let ca = UInt32Chunked::new("".into(), &[Some(1), None, None, Some(4), Some(5)]); let out = interpolate(&ca.into_series(), InterpolationMethod::Linear); let out = out.f64().unwrap(); assert_eq!( @@ -219,7 +219,7 @@ mod test { &[Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)] ); - let ca = UInt32Chunked::new("", &[None, Some(1), None, None, Some(4), Some(5)]); + let ca = UInt32Chunked::new("".into(), &[None, Some(1), None, None, Some(4), Some(5)]); let out = interpolate(&ca.into_series(), InterpolationMethod::Linear); let out = out.f64().unwrap(); assert_eq!( @@ -227,7 +227,10 @@ mod test { &[None, Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)] ); - let ca = UInt32Chunked::new("", &[None, Some(1), None, None, Some(4), Some(5), None]); + let ca = UInt32Chunked::new( + "".into(), + &[None, Some(1), None, None, Some(4), Some(5), None], + ); let out = interpolate(&ca.into_series(), InterpolationMethod::Linear); let out = out.f64().unwrap(); assert_eq!( @@ -242,7 +245,10 @@ mod test { None ] ); - let ca = UInt32Chunked::new("", &[None, Some(1), None, None, Some(4), Some(5), None]); + let ca = UInt32Chunked::new( + "".into(), + &[None, Some(1), None, None, Some(4), Some(5), None], + ); let out = interpolate(&ca.into_series(), InterpolationMethod::Nearest); let out = out.u32().unwrap(); 
assert_eq!( @@ -253,7 +259,7 @@ mod test { #[test] fn test_interpolate_decreasing_unsigned() { - let ca = UInt32Chunked::new("", &[Some(4), None, None, Some(1)]); + let ca = UInt32Chunked::new("".into(), &[Some(4), None, None, Some(1)]); let out = interpolate(&ca.into_series(), InterpolationMethod::Linear); let out = out.f64().unwrap(); assert_eq!( @@ -265,7 +271,7 @@ mod test { #[test] fn test_interpolate2() { let ca = Float32Chunked::new( - "", + "".into(), &[ Some(4653f32), None, diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs index c77d2ad6f157..06a8378055da 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs @@ -155,9 +155,9 @@ where out.into(), Some(validity.into()), ); - Ok(ChunkedArray::with_chunk(chunked_arr.name(), array)) + Ok(ChunkedArray::with_chunk(chunked_arr.name().clone(), array)) } else { - Ok(ChunkedArray::from_vec(chunked_arr.name(), out)) + Ok(ChunkedArray::from_vec(chunked_arr.name().clone(), out)) } } @@ -257,9 +257,9 @@ where out.into(), Some(validity.into()), ); - Ok(ChunkedArray::with_chunk(ca_sorted.name(), array)) + Ok(ChunkedArray::with_chunk(ca_sorted.name().clone(), array)) } else { - Ok(ChunkedArray::from_vec(ca_sorted.name(), out)) + Ok(ChunkedArray::from_vec(ca_sorted.name().clone(), out)) } } diff --git a/crates/polars-ops/src/series/ops/is_first_distinct.rs b/crates/polars-ops/src/series/ops/is_first_distinct.rs index d3440340d9a7..4fdb10e162c3 100644 --- a/crates/polars-ops/src/series/ops/is_first_distinct.rs +++ b/crates/polars-ops/src/series/ops/is_first_distinct.rs @@ -20,7 +20,7 @@ where .collect_trusted() }); - BooleanChunked::from_chunk_iter(ca.name(), chunks) + BooleanChunked::from_chunk_iter(ca.name().clone(), chunks) } fn is_first_distinct_bin(ca: &BinaryChunked) -> BooleanChunked { @@ -31,7 +31,7 @@ fn is_first_distinct_bin(ca: 
&BinaryChunked) -> BooleanChunked { .collect_trusted() }); - BooleanChunked::from_chunk_iter(ca.name(), chunks) + BooleanChunked::from_chunk_iter(ca.name().clone(), chunks) } fn is_first_distinct_boolean(ca: &BooleanChunked) -> BooleanChunked { @@ -69,7 +69,7 @@ fn is_first_distinct_boolean(ca: &BooleanChunked) -> BooleanChunked { } } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - BooleanChunked::with_chunk(ca.name(), arr) + BooleanChunked::with_chunk(ca.name().clone(), arr) } #[cfg(feature = "dtype-struct")] @@ -85,7 +85,7 @@ fn is_first_distinct_struct(s: &Series) -> PolarsResult { } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - Ok(BooleanChunked::with_chunk(s.name(), arr)) + Ok(BooleanChunked::with_chunk(s.name().clone(), arr)) } fn is_first_distinct_list(ca: &ListChunked) -> PolarsResult { @@ -100,15 +100,15 @@ fn is_first_distinct_list(ca: &ListChunked) -> PolarsResult { } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - Ok(BooleanChunked::with_chunk(ca.name(), arr)) + Ok(BooleanChunked::with_chunk(ca.name().clone(), arr)) } pub fn is_first_distinct(s: &Series) -> PolarsResult { // fast path. 
if s.len() == 0 { - return Ok(BooleanChunked::full_null(s.name(), 0)); + return Ok(BooleanChunked::full_null(s.name().clone(), 0)); } else if s.len() == 1 { - return Ok(BooleanChunked::new(s.name(), &[true])); + return Ok(BooleanChunked::new(s.name().clone(), &[true])); } let s = s.to_physical_repr(); diff --git a/crates/polars-ops/src/series/ops/is_in.rs b/crates/polars-ops/src/series/ops/is_in.rs index 3461b9789e2d..6c6c6eeb6c40 100644 --- a/crates/polars-ops/src/series/ops/is_in.rs +++ b/crates/polars-ops/src/series/ops/is_in.rs @@ -25,7 +25,10 @@ where } }) }); - Ok(unary_elementwise_values(ca, |val| set.contains(&val.to_total_ord())).with_name(ca.name())) + Ok( + unary_elementwise_values(ca, |val| set.contains(&val.to_total_ord())) + .with_name(ca.name().clone()), + ) } fn is_in_helper<'a, T>(ca: &'a ChunkedArray, other: &Series) -> PolarsResult @@ -70,7 +73,7 @@ where .collect_trusted() } }; - ca.rename(ca_in.name()); + ca.rename(ca_in.name().clone()); Ok(ca) } @@ -105,7 +108,7 @@ where }) .collect_trusted() }; - ca.rename(ca_in.name()); + ca.rename(ca_in.name().clone()); Ok(ca) } @@ -198,7 +201,7 @@ fn is_in_string_list_categorical( .collect() } }; - ca.rename(ca_in.name()); + ca.rename(ca_in.name().clone()); Ok(ca) } @@ -267,7 +270,7 @@ fn is_in_binary_list(ca_in: &BinaryChunked, other: &Series) -> PolarsResult PolarsResult PolarsResult PolarsResult PolarsResult polars_bail!(opq = is_in, ca_in.dtype(), other.dtype()), } @@ -449,7 +452,7 @@ fn is_in_struct_list(ca_in: &StructChunked, other: &Series) -> PolarsResult PolarsResult PolarsResult PolarsResult { // In case of fast unique, we can directly use the categories. 
Otherwise we need to // first get the unique physicals - let categories = StringChunked::with_chunk("", other.get_rev_map().get_categories().clone()); + let categories = StringChunked::with_chunk( + PlSmallStr::const_default(), + other.get_rev_map().get_categories().clone(), + ); let other = if other._can_fast_unique() { categories } else { @@ -624,7 +630,7 @@ fn is_in_cat(ca_in: &CategoricalChunked, other: &Series) -> PolarsResult PolarsResult PolarsResult { // fast path. if s.len() == 0 { - return Ok(BooleanChunked::full_null(s.name(), 0)); + return Ok(BooleanChunked::full_null(s.name().clone(), 0)); } else if s.len() == 1 { - return Ok(BooleanChunked::new(s.name(), &[true])); + return Ok(BooleanChunked::new(s.name().clone(), &[true])); } let s = s.to_physical_repr(); @@ -107,7 +107,7 @@ fn is_last_distinct_boolean(ca: &BooleanChunked) -> BooleanChunked { } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - BooleanChunked::with_chunk(ca.name(), arr) + BooleanChunked::with_chunk(ca.name().clone(), arr) } fn is_last_distinct_bin(ca: &BinaryChunked) -> BooleanChunked { @@ -120,7 +120,7 @@ fn is_last_distinct_bin(ca: &BinaryChunked) -> BooleanChunked { .map(|opt_v| unique.insert(opt_v)) .collect_reversed::>() .into_inner(); - new_ca.rename(ca.name()); + new_ca.rename(ca.name().clone()); new_ca } @@ -139,7 +139,7 @@ where .map(|opt_v| unique.insert(opt_v.to_total_ord())) .collect_reversed::>() .into_inner(); - new_ca.rename(ca.name()); + new_ca.rename(ca.name().clone()); new_ca } @@ -157,7 +157,7 @@ fn is_last_distinct_struct(s: &Series) -> PolarsResult { } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - Ok(BooleanChunked::with_chunk(s.name(), arr)) + Ok(BooleanChunked::with_chunk(s.name().clone(), arr)) } fn is_last_distinct_list(ca: &ListChunked) -> PolarsResult { @@ -173,5 +173,5 @@ fn is_last_distinct_list(ca: &ListChunked) -> PolarsResult { } let arr = BooleanArray::new(ArrowDataType::Boolean, out.into(), None); - 
Ok(BooleanChunked::with_chunk(ca.name(), arr)) + Ok(BooleanChunked::with_chunk(ca.name().clone(), arr)) } diff --git a/crates/polars-ops/src/series/ops/is_unique.rs b/crates/polars-ops/src/series/ops/is_unique.rs index 265e8736b35e..2f1d3de652ba 100644 --- a/crates/polars-ops/src/series/ops/is_unique.rs +++ b/crates/polars-ops/src/series/ops/is_unique.rs @@ -36,7 +36,7 @@ where unsafe { values.set_unchecked(idx as usize, setter) } } let arr = BooleanArray::from_data_default(values.into(), None); - BooleanChunked::with_chunk(ca.name(), arr) + BooleanChunked::with_chunk(ca.name().clone(), arr) } fn dispatcher(s: &Series, invert: bool) -> PolarsResult { @@ -75,9 +75,9 @@ fn dispatcher(s: &Series, invert: bool) -> PolarsResult { }; }, Null => match s.len() { - 0 => BooleanChunked::new(s.name(), [] as [bool; 0]), - 1 => BooleanChunked::new(s.name(), [!invert]), - len => BooleanChunked::full(s.name(), invert, len), + 0 => BooleanChunked::new(s.name().clone(), [] as [bool; 0]), + 1 => BooleanChunked::new(s.name().clone(), [!invert]), + len => BooleanChunked::full(s.name().clone(), invert, len), }, dt if dt.is_numeric() => { with_match_physical_integer_polars_type!(s.dtype(), |$T| { diff --git a/crates/polars-ops/src/series/ops/log.rs b/crates/polars-ops/src/series/ops/log.rs index 7b914071de80..6cd1fb07e974 100644 --- a/crates/polars-ops/src/series/ops/log.rs +++ b/crates/polars-ops/src/series/ops/log.rs @@ -92,7 +92,10 @@ pub trait LogSeries: SeriesSealed { let pk = s.as_ref(); let pk = if normalize { - let sum = pk.sum_reduce().unwrap().into_series(""); + let sum = pk + .sum_reduce() + .unwrap() + .into_series(PlSmallStr::const_default()); if sum.get(0).unwrap().extract::().unwrap() != 1.0 { (pk / &sum)? 
diff --git a/crates/polars-ops/src/series/ops/moment.rs b/crates/polars-ops/src/series/ops/moment.rs index be20c8ae981e..0381767e98b4 100644 --- a/crates/polars-ops/src/series/ops/moment.rs +++ b/crates/polars-ops/src/series/ops/moment.rs @@ -129,7 +129,7 @@ mod test { #[test] fn test_moment_compute() -> PolarsResult<()> { - let s = Series::new("", &[1, 2, 3, 4, 5, 23]); + let s = Series::new(PlSmallStr::const_default(), &[1, 2, 3, 4, 5, 23]); assert_eq!(moment(&s, 0)?, Some(1.0)); assert_eq!(moment(&s, 1)?, Some(0.0)); @@ -141,8 +141,11 @@ mod test { #[test] fn test_skew() -> PolarsResult<()> { - let s = Series::new("", &[1, 2, 3, 4, 5, 23]); - let s2 = Series::new("", &[Some(1), Some(2), Some(3), None, Some(1)]); + let s = Series::new(PlSmallStr::const_default(), &[1, 2, 3, 4, 5, 23]); + let s2 = Series::new( + PlSmallStr::const_default(), + &[Some(1), Some(2), Some(3), None, Some(1)], + ); assert!((s.skew(false)?.unwrap() - 2.2905330058490514).abs() < 0.0001); assert!((s.skew(true)?.unwrap() - 1.6727687946848508).abs() < 0.0001); @@ -155,7 +158,7 @@ mod test { #[test] fn test_kurtosis() -> PolarsResult<()> { - let s = Series::new("", &[1, 2, 3, 4, 5, 23]); + let s = Series::new(PlSmallStr::const_default(), &[1, 2, 3, 4, 5, 23]); assert!((s.kurtosis(true, true)?.unwrap() - 0.9945668771797536).abs() < 0.0001); assert!((s.kurtosis(true, false)?.unwrap() - 5.400820058440946).abs() < 0.0001); @@ -163,7 +166,7 @@ mod test { assert!((s.kurtosis(false, false)?.unwrap() - 8.400820058440946).abs() < 0.0001); let s2 = Series::new( - "", + PlSmallStr::const_default(), &[Some(1), Some(2), Some(3), None, Some(1), Some(2), Some(3)], ); assert!((s2.kurtosis(true, true)?.unwrap() - (-1.5)).abs() < 0.0001); diff --git a/crates/polars-ops/src/series/ops/pct_change.rs b/crates/polars-ops/src/series/ops/pct_change.rs index 56c7af142e9b..9cb45dac1d6f 100644 --- a/crates/polars-ops/src/series/ops/pct_change.rs +++ b/crates/polars-ops/src/series/ops/pct_change.rs @@ -20,6 +20,6 @@ pub 
fn pct_change(s: &Series, n: &Series) -> PolarsResult { if let Some(n) = n_s.i64()?.get(0) { diff(&fill_null_s, n, NullBehavior::Ignore)?.divide(&fill_null_s.shift(n)) } else { - Ok(Series::full_null(s.name(), s.len(), s.dtype())) + Ok(Series::full_null(s.name().clone(), s.len(), s.dtype())) } } diff --git a/crates/polars-ops/src/series/ops/rank.rs b/crates/polars-ops/src/series/ops/rank.rs index 0c57307626c0..4021443a3534 100644 --- a/crates/polars-ops/src/series/ops/rank.rs +++ b/crates/polars-ops/src/series/ops/rank.rs @@ -71,20 +71,20 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> Average => DataType::Float64, _ => IDX_DTYPE, }; - return Series::full_null(s.name(), s.len(), &dt); + return Series::full_null(s.name().clone(), s.len(), &dt); } match len { 1 => { return match method { - Average => Series::new(s.name(), &[1.0f64]), - _ => Series::new(s.name(), &[1 as IdxSize]), + Average => Series::new(s.name().clone(), &[1.0f64]), + _ => Series::new(s.name().clone(), &[1 as IdxSize]), }; }, 0 => { return match method { - Average => Float64Chunked::from_slice(s.name(), &[]).into_series(), - _ => IdxCa::from_slice(s.name(), &[]).into_series(), + Average => Float64Chunked::from_slice(s.name().clone(), &[]).into_series(), + _ => IdxCa::from_slice(s.name().clone(), &[]).into_series(), }; }, _ => {}, @@ -92,8 +92,8 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> if null_count == len { return match method { - Average => Float64Chunked::full_null(s.name(), len).into_series(), - _ => IdxCa::full_null(s.name(), len).into_series(), + Average => Float64Chunked::full_null(s.name().clone(), len).into_series(), + _ => IdxCa::full_null(s.name().clone(), len).into_series(), }; } @@ -118,7 +118,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> rank += 1; } } - IdxCa::from_vec_validity(s.name(), out, validity).into_series() + IdxCa::from_vec_validity(s.name().clone(), out, 
validity).into_series() } else { let sorted_values = unsafe { s.take_unchecked(&sort_idx_ca) }; let not_consecutive_same = sorted_values @@ -141,7 +141,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> rank += 1; } }); - IdxCa::from_vec_validity(s.name(), out, validity).into_series() + IdxCa::from_vec_validity(s.name().clone(), out, validity).into_series() }, Average => unsafe { let mut out = vec![0.0; s.len()]; @@ -154,7 +154,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> *out.get_unchecked_mut(*i as usize) = avg; } }); - Float64Chunked::from_vec_validity(s.name(), out, validity).into_series() + Float64Chunked::from_vec_validity(s.name().clone(), out, validity).into_series() }, Min => unsafe { let mut out = vec![0 as IdxSize; s.len()]; @@ -164,7 +164,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> } rank += ties.len() as IdxSize; }); - IdxCa::from_vec_validity(s.name(), out, validity).into_series() + IdxCa::from_vec_validity(s.name().clone(), out, validity).into_series() }, Max => unsafe { let mut out = vec![0 as IdxSize; s.len()]; @@ -174,7 +174,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> *out.get_unchecked_mut(*i as usize) = rank - 1; } }); - IdxCa::from_vec_validity(s.name(), out, validity).into_series() + IdxCa::from_vec_validity(s.name().clone(), out, validity).into_series() }, Dense => unsafe { let mut out = vec![0 as IdxSize; s.len()]; @@ -184,7 +184,7 @@ fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option) -> } rank += 1; }); - IdxCa::from_vec_validity(s.name(), out, validity).into_series() + IdxCa::from_vec_validity(s.name().clone(), out, validity).into_series() }, Ordinal => unreachable!(), } @@ -205,7 +205,7 @@ mod test { #[test] fn test_rank() -> PolarsResult<()> { - let s = Series::new("a", &[1, 2, 3, 2, 2, 3, 0]); + let s = Series::new("a".into(), &[1, 2, 3, 2, 2, 3, 0]); let out = rank(&s, 
RankMethod::Ordinal, false, None) .idx()? @@ -253,7 +253,7 @@ mod test { assert_eq!(out, &[2.0f64, 4.0, 6.5, 4.0, 4.0, 6.5, 1.0]); let s = Series::new( - "a", + "a".into(), &[Some(1), Some(2), Some(3), Some(2), None, None, Some(0)], ); @@ -275,7 +275,7 @@ mod test { ] ); let s = Series::new( - "a", + "a".into(), &[ Some(5), Some(6), @@ -310,7 +310,7 @@ mod test { #[test] fn test_rank_all_null() -> PolarsResult<()> { - let s = UInt32Chunked::new("", &[None, None, None]).into_series(); + let s = UInt32Chunked::new("".into(), &[None, None, None]).into_series(); let out = rank(&s, RankMethod::Average, false, None) .f64()? .into_iter() @@ -326,7 +326,7 @@ mod test { #[test] fn test_rank_empty() { - let s = UInt32Chunked::from_slice("", &[]).into_series(); + let s = UInt32Chunked::from_slice("".into(), &[]).into_series(); let out = rank(&s, RankMethod::Average, false, None); assert_eq!(out.dtype(), &DataType::Float64); let out = rank(&s, RankMethod::Max, false, None); @@ -335,7 +335,7 @@ mod test { #[test] fn test_rank_reverse() -> PolarsResult<()> { - let s = Series::new("", &[None, Some(1), Some(1), Some(5), None]); + let s = Series::new("".into(), &[None, Some(1), Some(1), Some(5), None]); let out = rank(&s, RankMethod::Dense, true, None) .idx()? .into_iter() diff --git a/crates/polars-ops/src/series/ops/replace.rs b/crates/polars-ops/src/series/ops/replace.rs index a331078318ea..438de7956325 100644 --- a/crates/polars-ops/src/series/ops/replace.rs +++ b/crates/polars-ops/src/series/ops/replace.rs @@ -138,7 +138,10 @@ fn replace_by_single_strict(s: &Series, old: &Series, new: &Series) -> PolarsRes // Transfer validity from `mask` to `out`. if mask.null_count() > 0 { - out = out.zip_with(&mask, &Series::new_null("", s.len()))? + out = out.zip_with( + &mask, + &Series::new_null(PlSmallStr::const_default(), s.len()), + )? 
} Ok(out) } @@ -169,7 +172,7 @@ fn replace_by_multiple( let joined = df.join( &replacer, - [s.name()], + [s.name().as_str()], ["__POLARS_REPLACE_OLD"], JoinArgs { how: JoinType::Left, @@ -207,7 +210,7 @@ fn replace_by_multiple_strict(s: &Series, old: Series, new: Series) -> PolarsRes let joined = df.join( &replacer, - [s.name()], + [s.name().as_str()], ["__POLARS_REPLACE_OLD"], JoinArgs { how: JoinType::Left, @@ -231,11 +234,12 @@ fn replace_by_multiple_strict(s: &Series, old: Series, new: Series) -> PolarsRes // Build replacer dataframe. fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsResult { - old.rename("__POLARS_REPLACE_OLD"); - new.rename("__POLARS_REPLACE_NEW"); + old.rename(PlSmallStr::from_static("__POLARS_REPLACE_OLD")); + new.rename(PlSmallStr::from_static("__POLARS_REPLACE_NEW")); let cols = if add_mask { - let mask = Series::new("__POLARS_REPLACE_MASK", &[true]).new_from_index(0, new.len()); + let mask = Series::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) + .new_from_index(0, new.len()); vec![old, new, mask] } else { vec![old, new] diff --git a/crates/polars-ops/src/series/ops/rle.rs b/crates/polars-ops/src/series/ops/rle.rs index 671f5c561f39..8659512673f1 100644 --- a/crates/polars-ops/src/series/ops/rle.rs +++ b/crates/polars-ops/src/series/ops/rle.rs @@ -9,7 +9,7 @@ pub fn rle(s: &Series) -> PolarsResult { let mut lengths = Vec::::with_capacity(n_runs as usize); lengths.push(1); - let mut vals = Series::new_empty("value", s.dtype()); + let mut vals = Series::new_empty(PlSmallStr::from_static("value"), s.dtype()); let vals = vals.extend(&s.head(Some(1)))?.extend(&s2.filter(&s_neq)?)?; let mut idx = 0; @@ -25,14 +25,17 @@ pub fn rle(s: &Series) -> PolarsResult { } } - let outvals = vec![Series::from_vec("len", lengths), vals.to_owned()]; - Ok(StructChunked::from_series(s.name(), &outvals)?.into_series()) + let outvals = vec![ + Series::from_vec(PlSmallStr::from_static("len"), lengths), + 
vals.to_owned(), + ]; + Ok(StructChunked::from_series(s.name().clone(), &outvals)?.into_series()) } /// Similar to `rle`, but maps values to run IDs. pub fn rle_id(s: &Series) -> PolarsResult { if s.len() == 0 { - return Ok(Series::new_empty(s.name(), &IDX_DTYPE)); + return Ok(Series::new_empty(s.name().clone(), &IDX_DTYPE)); } let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len())); let s_neq = s1.not_equal_missing(&s2)?; @@ -47,7 +50,7 @@ pub fn rle_id(s: &Series) -> PolarsResult { out.push(last); } } - Ok(IdxCa::from_vec(s.name(), out) + Ok(IdxCa::from_vec(s.name().clone(), out) .with_sorted_flag(IsSorted::Ascending) .into_series()) } diff --git a/crates/polars-ops/src/series/ops/round.rs b/crates/polars-ops/src/series/ops/round.rs index 2ee6c284d1b2..7ed6b2e40eed 100644 --- a/crates/polars-ops/src/series/ops/round.rs +++ b/crates/polars-ops/src/series/ops/round.rs @@ -101,7 +101,7 @@ mod test { #[test] fn test_round_series() { - let series = Series::new("a", &[1.003, 2.23222, 3.4352]); + let series = Series::new("a".into(), &[1.003, 2.23222, 3.4352]); let out = series.round(2).unwrap(); let ca = out.f64().unwrap(); assert_eq!(ca.get(0), Some(1.0)); diff --git a/crates/polars-ops/src/series/ops/search_sorted.rs b/crates/polars-ops/src/series/ops/search_sorted.rs index ee429949ce6f..11e97ef489e8 100644 --- a/crates/polars-ops/src/series/ops/search_sorted.rs +++ b/crates/polars-ops/src/series/ops/search_sorted.rs @@ -19,7 +19,7 @@ pub fn search_sorted( let search_values = search_values.str()?; let search_values = search_values.as_binary(); let idx = binary_search_ca(&ca, search_values.iter(), side, descending); - Ok(IdxCa::new_vec(s.name(), idx)) + Ok(IdxCa::new_vec(s.name().clone(), idx)) }, DataType::Boolean => { let ca = s.bool().unwrap(); @@ -41,7 +41,7 @@ pub fn search_sorted( }) }) .collect(); - Ok(IdxCa::new_vec(s.name(), idxs)) + Ok(IdxCa::new_vec(s.name().clone(), idxs)) }, DataType::Binary => { let ca = s.binary().unwrap(); @@ -58,7 +58,7 @@ pub fn 
search_sorted( _ => unreachable!(), }; - Ok(IdxCa::new_vec(s.name(), idx)) + Ok(IdxCa::new_vec(s.name().clone(), idx)) }, dt if dt.is_numeric() => { let search_values = search_values.to_physical_repr(); @@ -68,7 +68,7 @@ pub fn search_sorted( let search_values: &ChunkedArray<$T> = search_values.as_ref().as_ref().as_ref(); binary_search_ca(ca, search_values.iter(), side, descending) }); - Ok(IdxCa::new_vec(s.name(), idx)) + Ok(IdxCa::new_vec(s.name().clone(), idx)) }, _ => polars_bail!(opq = search_sorted, original_dtype), } diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index f2d8c4f3b70a..3cd9d426ac1d 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -1,3 +1,5 @@ +use polars_utils::format_pl_smallstr; + use super::*; #[cfg(feature = "dtype-u8")] @@ -28,18 +30,16 @@ impl ToDummies for Series { // strings are formatted with extra \" \" in polars, so we // extract the string let name = if let Some(s) = av.get_str() { - format!("{col_name}{sep}{s}") + format_pl_smallstr!("{col_name}{sep}{s}") } else { // other types don't have this formatting issue - format!("{col_name}{sep}{av}") + format_pl_smallstr!("{col_name}{sep}{av}") }; let ca = match group { - GroupsIndicator::Idx((_, group)) => { - dummies_helper_idx(group, self.len(), &name) - }, + GroupsIndicator::Idx((_, group)) => dummies_helper_idx(group, self.len(), name), GroupsIndicator::Slice([offset, len]) => { - dummies_helper_slice(offset, len, self.len(), &name) + dummies_helper_slice(offset, len, self.len(), name) }, }; ca.into_series() @@ -50,7 +50,7 @@ impl ToDummies for Series { } } -fn dummies_helper_idx(groups: &[IdxSize], len: usize, name: &str) -> DummyCa { +fn dummies_helper_idx(groups: &[IdxSize], len: usize, name: PlSmallStr) -> DummyCa { let mut av = vec![0 as DummyType; len]; for &idx in groups { @@ -65,7 +65,7 @@ fn dummies_helper_slice( group_offset: IdxSize, group_len: 
IdxSize, len: usize, - name: &str, + name: PlSmallStr, ) -> DummyCa { let mut av = vec![0 as DummyType; len]; diff --git a/crates/polars-ops/src/series/ops/unique.rs b/crates/polars-ops/src/series/ops/unique.rs index 3a2d9b5652fe..e48509b1ce73 100644 --- a/crates/polars-ops/src/series/ops/unique.rs +++ b/crates/polars-ops/src/series/ops/unique.rs @@ -41,9 +41,9 @@ pub fn unique_counts(s: &Series) -> PolarsResult { }, DataType::Null => { let ca = if s.is_empty() { - IdxCa::new(s.name(), [] as [IdxSize; 0]) + IdxCa::new(s.name().clone(), [] as [IdxSize; 0]) } else { - IdxCa::new(s.name(), [s.len() as IdxSize]) + IdxCa::new(s.name().clone(), [s.len() as IdxSize]) }; Ok(ca.into_series()) }, diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs index cfb4e7fc35f0..c0de2c54d0c5 100644 --- a/crates/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -16,18 +16,19 @@ pub trait SeriesMethods: SeriesSealed { &self, sort: bool, parallel: bool, - name: String, + name: PlSmallStr, normalize: bool, ) -> PolarsResult { let s = self.as_series(); polars_ensure!( - s.name() != name, - Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate column names; change `name` to fix", name, + s.name() != &name, + Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \ + column names; change `name` to fix", name, ); // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined let groups = s.group_tuples(parallel, sort)?; let values = unsafe { s.agg_first(&groups) }; - let counts = groups.group_count().with_name(name.as_str()); + let counts = groups.group_count().with_name(name.clone()); let counts = if normalize { let len = s.len() as f64; @@ -63,7 +64,7 @@ pub trait SeriesMethods: SeriesSealed { _ => { let mut h = vec![]; s.0.vec_hash(build_hasher, &mut h).unwrap(); - UInt64Chunked::from_vec(s.name(), h) 
+ UInt64Chunked::from_vec(s.name().clone(), h) }, } } @@ -93,7 +94,7 @@ pub trait SeriesMethods: SeriesSealed { #[cfg(feature = "dtype-struct")] if matches!(s.dtype(), DataType::Struct(_)) { let encoded = _get_rows_encoded_ca( - "", + PlSmallStr::const_default(), &[s.clone()], &[options.descending], &[options.nulls_last], diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs index af7868517673..a1f3054bd852 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs @@ -41,11 +41,8 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { Ok(Self::Plain(values)) }, - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict)) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(_)) => { let values = dict_indices_decoder(page)?; - if is_string { - arrow::array::validate_utf8_view(dict.0.as_ref(), dict.1.as_ref())?; - } Ok(Self::Dictionary(values)) }, (Encoding::DeltaLengthByteArray, _) => { @@ -93,6 +90,7 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { &mut self, decoder: &mut BinViewDecoder, decoded: &mut ::DecodedState, + is_optional: bool, page_validity: &mut Option>, dict: Option<&'a ::Dict>, additional: usize, @@ -107,6 +105,7 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { decoder.decode_plain_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), additional, )?; @@ -120,6 +119,7 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { decoder.decode_dictionary_encoded( decoded, page, + is_optional, page_validity.as_mut(), dict, additional, @@ -138,7 +138,13 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { }; match page_validity { - None => (&mut collector).push_n(values, additional)?, + None => { + (&mut 
collector).push_n(values, additional)?; + + if is_optional { + validity.extend_constant(additional, true); + } + }, Some(page_validity) => extend_from_decoder( validity, page_validity, @@ -158,7 +164,13 @@ impl<'a> utils::StateTranslation<'a, BinViewDecoder> for StateTranslation<'a> { }; match page_validity { - None => collector.push_n(values, additional)?, + None => { + collector.push_n(values, additional)?; + + if is_optional { + validity.extend_constant(additional, true); + } + }, Some(page_validity) => extend_from_decoder( validity, page_validity, @@ -536,7 +548,7 @@ impl utils::Decoder for BinViewDecoder { Ok(()) } - fn deserialize_dict(&self, page: DictPage) -> Self::Dict { + fn deserialize_dict(&self, page: DictPage) -> ParquetResult { let values = &page.buffer; let num_values = page.num_values; @@ -549,9 +561,12 @@ impl utils::Decoder for BinViewDecoder { let mut buffers = Vec::with_capacity(1); let mut offset = 0; - for v in BinaryIter::new(values, num_values) { - if v.len() <= View::MAX_INLINE_SIZE as usize { - views.push(View::new_inline(v)); + let mut max_length = 0; + views.extend(BinaryIter::new(values, num_values).map(|v| { + let length = v.len(); + max_length = usize::max(length, max_length); + if length <= View::MAX_INLINE_SIZE as usize { + View::new_inline(v) } else { if offset >= u32::MAX as usize { let full_buffer = std::mem::take(&mut buffer); @@ -562,20 +577,39 @@ impl utils::Decoder for BinViewDecoder { } buffer.extend_from_slice(v); - views.push(View::new_from_bytes(v, buffers.len() as u32, offset as u32)); + let view = View::new_from_bytes(v, buffers.len() as u32, offset as u32); offset += v.len(); + view } - } + })); buffers.push(Buffer::from(buffer)); - (views, buffers) + if self.check_utf8.load(Ordering::Relaxed) { + // This is a small trick that allows us to check the Parquet buffer instead of the view + // buffer. Batching the UTF-8 verification is more performant. 
For this to be allowed, + // all the interleaved lengths need to be valid UTF-8. + // + // Every string is prepended by 4 bytes (L, 0, 0, 0). Since we check here that L < 128, + // L is always a valid single-byte UTF-8 code-point and (L, 0, 0, 0) is valid UTF-8. + // Consequently, it is valid to just check the whole buffer. + if max_length < 128 { + simdutf8::basic::from_utf8(values) + .map_err(|_| ParquetError::oos("String data contained invalid UTF-8"))?; + } else { + arrow::array::validate_utf8_view(&views, &buffers) + .map_err(|_| ParquetError::oos("String data contained invalid UTF-8"))?; + } + } + + Ok((views, buffers)) } fn decode_plain_encoded<'a>( &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()> { @@ -629,7 +663,13 @@ impl utils::Decoder for BinViewDecoder { }; match page_validity { - None => collector.push_n(values, limit)?, + None => { + collector.push_n(values, limit)?; + + if is_optional { + validity.extend_constant(limit, true); + } + }, Some(page_validity) => { extend_from_decoder(validity, page_validity, Some(limit), values, collector)? }, @@ -641,6 +681,10 @@ impl utils::Decoder for BinViewDecoder { // This is a small trick that allows us to check the Parquet buffer instead of the view // buffer. Batching the UTF-8 verification is more performant. For this to be allowed, // all the interleaved lengths need to be valid UTF-8. + // + // Every string is prepended by 4 bytes (L, 0, 0, 0). Since we check here that L < 128, + // L is always a valid single-byte UTF-8 code-point and (L, 0, 0, 0) is valid UTF-8. + // Consequently, it is valid to just check the whole buffer. 
if max_length < 128 { simdutf8::basic::from_utf8(buffer) .map_err(|_| ParquetError::oos("String data contained invalid UTF-8"))?; @@ -658,6 +702,7 @@ impl utils::Decoder for BinViewDecoder { &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut hybrid_rle::HybridRleDecoder<'a>, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, dict: &Self::Dict, limit: usize, @@ -747,6 +792,10 @@ impl utils::Decoder for BinViewDecoder { match page_validity { None => { page_values.gather_n_into(values, limit, &translator)?; + + if is_optional { + validity.extend_constant(limit, true); + } }, Some(page_validity) => { struct Collector<'a, 'b> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs index 79808d2b388d..e99e7a5ed56c 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs @@ -89,6 +89,7 @@ impl<'a> utils::StateTranslation<'a, BooleanDecoder> for StateTranslation<'a> { &mut self, decoder: &mut BooleanDecoder, decoded: &mut ::DecodedState, + is_optional: bool, page_validity: &mut Option>, _: Option<&'a ::Dict>, additional: usize, @@ -97,13 +98,20 @@ impl<'a> utils::StateTranslation<'a, BooleanDecoder> for StateTranslation<'a> { Self::Plain(page_values) => decoder.decode_plain_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), additional, )?, Self::Rle(page_values) => { let (values, validity) = decoded; match page_validity { - None => page_values.gather_n_into(values, additional, &BitmapGatherer)?, + None => { + page_values.gather_n_into(values, additional, &BitmapGatherer)?; + + if is_optional { + validity.extend_constant(additional, true); + } + }, Some(page_validity) => utils::extend_from_decoder( validity, page_validity, @@ -199,17 +207,26 @@ impl Decoder for BooleanDecoder { ) } - fn deserialize_dict(&self, _: DictPage) -> Self::Dict {} + fn 
deserialize_dict(&self, _: DictPage) -> ParquetResult { + Ok(()) + } fn decode_plain_encoded<'a>( &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()> { match page_validity { - None => page_values.collect_n_into(values, limit), + None => { + page_values.collect_n_into(values, limit); + + if is_optional { + validity.extend_constant(limit, true); + } + }, Some(page_validity) => { extend_from_decoder(validity, page_validity, Some(limit), values, page_values)? }, @@ -222,6 +239,7 @@ impl Decoder for BooleanDecoder { &mut self, _decoded: &mut Self::DecodedState, _page_values: &mut HybridRleDecoder<'a>, + _is_optional: bool, _page_validity: Option<&mut PageValidity<'a>>, _dict: &Self::Dict, _limit: usize, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs index 09f9807ae5ac..db718ed9c330 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs @@ -47,6 +47,7 @@ impl<'a, K: DictionaryKey, D: utils::DictDecodable> StateTranslation<'a, Diction &mut self, decoder: &mut DictionaryDecoder, decoded: &mut as Decoder>::DecodedState, + is_optional: bool, page_validity: &mut Option>, _: Option<&'a as Decoder>::Dict>, additional: usize, @@ -65,7 +66,13 @@ impl<'a, K: DictionaryKey, D: utils::DictDecodable> StateTranslation<'a, Diction }; match page_validity { - None => collector.push_n(&mut decoded.0, additional)?, + None => { + collector.push_n(&mut decoded.0, additional)?; + + if is_optional { + validity.extend_constant(additional, true); + } + }, Some(page_validity) => { extend_from_decoder(validity, page_validity, Some(additional), values, collector)? 
}, @@ -105,11 +112,11 @@ impl utils::Decoder for DictionaryDec ) } - fn deserialize_dict(&self, page: DictPage) -> Self::Dict { - let dict = self.decoder.deserialize_dict(page); + fn deserialize_dict(&self, page: DictPage) -> ParquetResult { + let dict = self.decoder.deserialize_dict(page)?; self.dict_size .store(dict.len(), std::sync::atomic::Ordering::Relaxed); - dict + Ok(dict) } fn finalize( @@ -129,6 +136,7 @@ impl utils::Decoder for DictionaryDec &mut self, _decoded: &mut Self::DecodedState, _page_values: &mut as StateTranslation<'a, Self>>::PlainDecoder, + _is_optional: bool, _page_validity: Option<&mut PageValidity<'a>>, _limit: usize, ) -> ParquetResult<()> { @@ -139,6 +147,7 @@ impl utils::Decoder for DictionaryDec &mut self, _decoded: &mut Self::DecodedState, _page_values: &mut HybridRleDecoder<'a>, + _is_optional: bool, _page_validity: Option<&mut PageValidity<'a>>, _dict: &Self::Dict, _limit: usize, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs index 8473faa9c56f..b4fcd2c38e7d 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs @@ -75,6 +75,7 @@ impl<'a> utils::StateTranslation<'a, BinaryDecoder> for StateTranslation<'a> { &mut self, decoder: &mut BinaryDecoder, decoded: &mut ::DecodedState, + is_optional: bool, page_validity: &mut Option>, dict: Option<&'a ::Dict>, additional: usize, @@ -84,12 +85,14 @@ impl<'a> utils::StateTranslation<'a, BinaryDecoder> for StateTranslation<'a> { T::Plain(page_values, _) => decoder.decode_plain_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), additional, )?, T::Dictionary(page_values) => decoder.decode_dictionary_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), dict.unwrap(), additional, @@ -134,14 +137,15 @@ impl Decoder for BinaryDecoder { ) } - fn 
deserialize_dict(&self, page: DictPage) -> Self::Dict { - page.buffer.into_vec() + fn deserialize_dict(&self, page: DictPage) -> ParquetResult { + Ok(page.buffer.into_vec()) } fn decode_plain_encoded<'a>( &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()> { @@ -180,7 +184,13 @@ impl Decoder for BinaryDecoder { }; match page_validity { - None => collector.push_n(&mut values.values, self.size)?, + None => { + collector.push_n(&mut values.values, limit)?; + + if is_optional { + validity.extend_constant(limit, true); + } + }, Some(page_validity) => extend_from_decoder( validity, page_validity, @@ -197,6 +207,7 @@ impl Decoder for BinaryDecoder { &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut hybrid_rle::HybridRleDecoder<'a>, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, dict: &Self::Dict, limit: usize, @@ -274,6 +285,10 @@ impl Decoder for BinaryDecoder { match page_validity { None => { page_values.gather_n_into(&mut values.values, limit, &gatherer)?; + + if is_optional { + validity.extend_constant(limit, true); + } }, Some(page_validity) => { let collector = GatheredHybridRle::new(page_values, &gatherer, null_value); diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index 9c79e1083585..a2076014a966 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -162,7 +162,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::unit(), + primitive::FloatDecoder::::unit(), init, )? 
.collect_n(filter) @@ -174,7 +174,7 @@ pub fn columns_to_iter_recursive( PageNestedDecoder::new( columns.pop().unwrap(), field.data_type().clone(), - primitive::PrimitiveDecoder::::unit(), + primitive::FloatDecoder::::unit(), init, )? .collect_n(filter) @@ -524,14 +524,14 @@ fn dict_read( Float32 => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new(primitive::PrimitiveDecoder::::unit()), + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), init, )? .collect_n(filter)?, Float64 => PageNestedDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new(primitive::PrimitiveDecoder::::unit()), + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), init, )? .collect_n(filter)?, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index dbf41ee7579c..42e321a2f570 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -742,7 +742,7 @@ impl PageNestedDecoder { init: Vec, ) -> ParquetResult { let dict_page = iter.read_dict_page()?; - let dict = dict_page.map(|d| decoder.deserialize_dict(d)); + let dict = dict_page.map(|d| decoder.deserialize_dict(d)).transpose()?; Ok(Self { iter, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/null.rs b/crates/polars-parquet/src/arrow/read/deserialize/null.rs index 5a4c68d0acd7..8066c1d73af3 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/null.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null.rs @@ -47,6 +47,7 @@ impl<'a> utils::StateTranslation<'a, NullDecoder> for () { &mut self, _decoder: &mut NullDecoder, decoded: &mut ::DecodedState, + _is_optional: bool, _page_validity: &mut Option>, _: Option<&'a ::Dict>, additional: usize, @@ -67,12 +68,15 @@ impl utils::Decoder for NullDecoder { NullArrayLength { length: 0 } } - fn 
deserialize_dict(&self, _: DictPage) -> Self::Dict {} + fn deserialize_dict(&self, _: DictPage) -> ParquetResult { + Ok(()) + } fn decode_plain_encoded<'a>( &mut self, _decoded: &mut Self::DecodedState, _page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + _is_optional: bool, _page_validity: Option<&mut utils::PageValidity<'a>>, _limit: usize, ) -> ParquetResult<()> { @@ -83,6 +87,7 @@ impl utils::Decoder for NullDecoder { &mut self, _decoded: &mut Self::DecodedState, _page_values: &mut hybrid_rle::HybridRleDecoder<'a>, + _is_optional: bool, _page_validity: Option<&mut utils::PageValidity<'a>>, _dict: &Self::Dict, _limit: usize, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/float.rs similarity index 63% rename from crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/float.rs index 552f89d9b7c3..1c09ea3f7e87 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/float.rs @@ -4,6 +4,10 @@ use arrow::datatypes::ArrowDataType; use arrow::types::NativeType; use super::super::utils; +use super::{ + deserialize_plain, AsDecoderFunction, ClosureDecoderFunction, DecoderFunction, + PlainDecoderFnCollector, PrimitiveDecoder, UnitDecoderFunction, +}; use crate::parquet::encoding::hybrid_rle::DictionaryTranslator; use crate::parquet::encoding::{byte_stream_split, hybrid_rle, Encoding}; use crate::parquet::error::ParquetResult; @@ -15,126 +19,6 @@ use crate::read::deserialize::utils::{ TranslatedHybridRle, }; -/// A function that defines how to decode from the -/// [`parquet::types::NativeType`][ParquetNativeType] to the [`arrow::types::NativeType`]. -/// -/// This should almost always be inlined. 
-pub(crate) trait DecoderFunction: Copy -where - T: NativeType, - P: ParquetNativeType, -{ - fn decode(self, x: P) -> T; -} - -#[derive(Default, Clone, Copy)] -pub(crate) struct UnitDecoderFunction(std::marker::PhantomData); -impl DecoderFunction for UnitDecoderFunction { - #[inline(always)] - fn decode(self, x: T) -> T { - x - } -} - -#[derive(Default, Clone, Copy)] -pub(crate) struct AsDecoderFunction(std::marker::PhantomData<(P, T)>); -macro_rules! as_decoder_impl { - ($($p:ty => $t:ty,)+) => { - $( - impl DecoderFunction<$p, $t> for AsDecoderFunction<$p, $t> { - #[inline(always)] - fn decode(self, x : $p) -> $t { - x as $t - } - } - )+ - }; -} - -as_decoder_impl![ - i32 => i8, - i32 => i16, - i32 => u8, - i32 => u16, - i32 => u32, - i64 => i32, - i64 => u32, - i64 => u64, -]; - -#[derive(Default, Clone, Copy)] -pub(crate) struct IntoDecoderFunction(std::marker::PhantomData<(P, T)>); -impl DecoderFunction for IntoDecoderFunction -where - P: ParquetNativeType + Into, - T: NativeType, -{ - #[inline(always)] - fn decode(self, x: P) -> T { - x.into() - } -} - -#[derive(Clone, Copy)] -pub(crate) struct ClosureDecoderFunction(F, std::marker::PhantomData<(P, T)>); -impl DecoderFunction for ClosureDecoderFunction -where - P: ParquetNativeType, - T: NativeType, - F: Copy + Fn(P) -> T, -{ - #[inline(always)] - fn decode(self, x: P) -> T { - (self.0)(x) - } -} - -pub(crate) struct PlainDecoderFnCollector<'a, 'b, P, T, D> -where - T: NativeType, - P: ParquetNativeType, - D: DecoderFunction, -{ - pub(crate) chunks: &'b mut ArrayChunks<'a, P>, - pub(crate) decoder: D, - pub(crate) _pd: std::marker::PhantomData, -} - -impl<'a, 'b, P, T, D: DecoderFunction> BatchableCollector<(), Vec> - for PlainDecoderFnCollector<'a, 'b, P, T, D> -where - T: NativeType, - P: ParquetNativeType, - D: DecoderFunction, -{ - fn reserve(target: &mut Vec, n: usize) { - target.reserve(n); - } - - fn push_n(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { - let n = 
usize::min(self.chunks.len(), n); - let (items, remainder) = self.chunks.bytes.split_at(n); - let decoder = self.decoder; - target.extend( - items - .iter() - .map(|chunk| decoder.decode(P::from_le_bytes(*chunk))), - ); - self.chunks.bytes = remainder; - Ok(()) - } - - fn push_n_nulls(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { - target.resize(target.len() + n, T::default()); - Ok(()) - } - - fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { - self.chunks.skip_in_place(n); - Ok(()) - } -} - #[allow(clippy::large_enum_variant)] #[derive(Debug)] pub(crate) enum StateTranslation<'a, P: ParquetNativeType> { @@ -143,7 +27,7 @@ pub(crate) enum StateTranslation<'a, P: ParquetNativeType> { ByteStreamSplit(byte_stream_split::Decoder<'a>), } -impl<'a, P, T, D> utils::StateTranslation<'a, PrimitiveDecoder> for StateTranslation<'a, P> +impl<'a, P, T, D> utils::StateTranslation<'a, FloatDecoder> for StateTranslation<'a, P> where T: NativeType, P: ParquetNativeType, @@ -152,9 +36,9 @@ where type PlainDecoder = ArrayChunks<'a, P>; fn new( - _decoder: &PrimitiveDecoder, + _decoder: &FloatDecoder, page: &'a DataPage, - dict: Option<&'a as utils::Decoder>::Dict>, + dict: Option<&'a as utils::Decoder>::Dict>, _page_validity: Option<&PageValidity<'a>>, ) -> ParquetResult { match (page.encoding(), dict) { @@ -202,22 +86,25 @@ where fn extend_from_state( &mut self, - decoder: &mut PrimitiveDecoder, - decoded: &mut as utils::Decoder>::DecodedState, + decoder: &mut FloatDecoder, + decoded: &mut as utils::Decoder>::DecodedState, + is_optional: bool, page_validity: &mut Option>, - dict: Option<&'a as utils::Decoder>::Dict>, + dict: Option<&'a as utils::Decoder>::Dict>, additional: usize, ) -> ParquetResult<()> { match self { Self::Plain(page_values) => decoder.decode_plain_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), additional, )?, Self::Dictionary(ref mut page) => decoder.decode_dictionary_encoded( decoded, page, + is_optional, 
page_validity.as_mut(), dict.unwrap(), additional, @@ -229,16 +116,20 @@ where None => { values.extend( page_values - .iter_converted(|v| decoder.decoder.decode(decode(v))) + .iter_converted(|v| decoder.0.decoder.decode(decode(v))) .take(additional), ); + + if is_optional { + validity.extend_constant(additional, true); + } }, Some(page_validity) => utils::extend_from_decoder( validity, page_validity, Some(additional), values, - &mut page_values.iter_converted(|v| decoder.decoder.decode(decode(v))), + &mut page_values.iter_converted(|v| decoder.0.decoder.decode(decode(v))), )?, } }, @@ -249,17 +140,13 @@ where } #[derive(Debug)] -pub(crate) struct PrimitiveDecoder +pub(crate) struct FloatDecoder(PrimitiveDecoder) where P: ParquetNativeType, T: NativeType, - D: DecoderFunction, -{ - pub(crate) decoder: D, - _pd: std::marker::PhantomData<(P, T)>, -} + D: DecoderFunction; -impl PrimitiveDecoder +impl FloatDecoder where P: ParquetNativeType, T: NativeType, @@ -267,14 +154,11 @@ where { #[inline] fn new(decoder: D) -> Self { - Self { - decoder, - _pd: std::marker::PhantomData, - } + Self(PrimitiveDecoder::new(decoder)) } } -impl PrimitiveDecoder> +impl FloatDecoder> where T: NativeType + ParquetNativeType, UnitDecoderFunction: Default + DecoderFunction, @@ -284,7 +168,7 @@ where } } -impl PrimitiveDecoder> +impl FloatDecoder> where P: ParquetNativeType, T: NativeType, @@ -295,18 +179,7 @@ where } } -impl PrimitiveDecoder> -where - P: ParquetNativeType, - T: NativeType, - IntoDecoderFunction: Default + DecoderFunction, -{ - pub(crate) fn cast_into() -> Self { - Self::new(IntoDecoderFunction::::default()) - } -} - -impl PrimitiveDecoder> +impl FloatDecoder> where P: ParquetNativeType, T: NativeType, @@ -323,7 +196,7 @@ impl utils::ExactSize for (Vec, MutableBitmap) { } } -impl utils::Decoder for PrimitiveDecoder +impl utils::Decoder for FloatDecoder where T: NativeType, P: ParquetNativeType, @@ -341,14 +214,15 @@ where ) } - fn deserialize_dict(&self, page: DictPage) -> 
Self::Dict { - deserialize_plain::(&page.buffer, self.decoder) + fn deserialize_dict(&self, page: DictPage) -> ParquetResult { + Ok(deserialize_plain::(&page.buffer, self.0.decoder)) } fn decode_plain_encoded<'a>( &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()> { @@ -356,15 +230,19 @@ where None => { PlainDecoderFnCollector { chunks: page_values, - decoder: self.decoder, + decoder: self.0.decoder, _pd: std::marker::PhantomData, } .push_n(values, limit)?; + + if is_optional { + validity.extend_constant(limit, true); + } }, Some(page_validity) => { let collector = PlainDecoderFnCollector { chunks: page_values, - decoder: self.decoder, + decoder: self.0.decoder, _pd: std::marker::PhantomData, }; @@ -385,6 +263,7 @@ where &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut hybrid_rle::HybridRleDecoder<'a>, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, dict: &Self::Dict, limit: usize, @@ -394,6 +273,10 @@ where match page_validity { None => { page_values.translate_and_collect_n_into(values, limit, &translator)?; + + if is_optional { + validity.extend_constant(limit, true); + } }, Some(page_validity) => { let translated_hybridrle = TranslatedHybridRle::new(page_values, &translator); @@ -422,7 +305,7 @@ where } } -impl utils::DictDecodable for PrimitiveDecoder +impl utils::DictDecodable for FloatDecoder where T: NativeType, P: ParquetNativeType, @@ -445,7 +328,7 @@ where } } -impl utils::NestedDecoder for PrimitiveDecoder +impl utils::NestedDecoder for FloatDecoder where T: NativeType, P: ParquetNativeType, @@ -468,16 +351,3 @@ where values.resize(values.len() + n, T::default()); } } - -pub(super) fn deserialize_plain(values: &[u8], decoder: D) -> Vec -where - T: NativeType, - P: ParquetNativeType, - D: DecoderFunction, -{ - values - 
.chunks_exact(std::mem::size_of::

()) - .map(decode) - .map(|v| decoder.decode(v)) - .collect::>() -} diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index bc729cb0ea9e..dfe3f2f09cd6 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -4,11 +4,11 @@ use arrow::datatypes::ArrowDataType; use arrow::types::NativeType; use super::super::utils; -use super::basic::{ - AsDecoderFunction, ClosureDecoderFunction, DecoderFunction, IntoDecoderFunction, - PlainDecoderFnCollector, PrimitiveDecoder, UnitDecoderFunction, +use super::{ + deserialize_plain, AsDecoderFunction, ClosureDecoderFunction, DecoderFunction, DeltaCollector, + DeltaTranslator, IntoDecoderFunction, PlainDecoderFnCollector, PrimitiveDecoder, + UnitDecoderFunction, }; -use super::{DeltaCollector, DeltaTranslator}; use crate::parquet::encoding::hybrid_rle::{self, DictionaryTranslator}; use crate::parquet::encoding::{byte_stream_split, delta_bitpacked, Encoding}; use crate::parquet::error::ParquetResult; @@ -99,6 +99,7 @@ where &mut self, decoder: &mut IntDecoder, decoded: &mut as utils::Decoder>::DecodedState, + is_optional: bool, page_validity: &mut Option>, dict: Option<&'a as utils::Decoder>::Dict>, additional: usize, @@ -107,12 +108,14 @@ where Self::Plain(page_values) => decoder.decode_plain_encoded( decoded, page_values, + is_optional, page_validity.as_mut(), additional, )?, Self::Dictionary(ref mut page) => decoder.decode_dictionary_encoded( decoded, page, + is_optional, page_validity.as_mut(), dict.unwrap(), additional, @@ -127,6 +130,10 @@ where .iter_converted(|v| decoder.0.decoder.decode(decode(v))) .take(additional), ); + + if is_optional { + validity.extend_constant(additional, true); + } }, Some(page_validity) => { utils::extend_from_decoder( @@ -149,7 +156,13 @@ where }; match page_validity { - None => 
page_values.gather_n_into(values, additional, &mut gatherer)?, + None => { + page_values.gather_n_into(values, additional, &mut gatherer)?; + + if is_optional { + validity.extend_constant(additional, true); + } + }, Some(page_validity) => utils::extend_from_decoder( validity, page_validity, @@ -185,8 +198,8 @@ where D: DecoderFunction, { #[inline] - fn new(decoder: PrimitiveDecoder) -> Self { - Self(decoder) + fn new(decoder: D) -> Self { + Self(PrimitiveDecoder::new(decoder)) } } @@ -197,7 +210,7 @@ where UnitDecoderFunction: Default + DecoderFunction, { pub(crate) fn unit() -> Self { - Self::new(PrimitiveDecoder::unit()) + Self::new(UnitDecoderFunction::::default()) } } @@ -209,7 +222,7 @@ where AsDecoderFunction: Default + DecoderFunction, { pub(crate) fn cast_as() -> Self { - Self::new(PrimitiveDecoder::cast_as()) + Self::new(AsDecoderFunction::::default()) } } @@ -221,7 +234,7 @@ where IntoDecoderFunction: Default + DecoderFunction, { pub(crate) fn cast_into() -> Self { - Self::new(PrimitiveDecoder::cast_into()) + Self::new(IntoDecoderFunction::::default()) } } @@ -233,7 +246,7 @@ where F: Copy + Fn(P) -> T, { pub(crate) fn closure(f: F) -> Self { - Self::new(PrimitiveDecoder::closure(f)) + Self::new(ClosureDecoderFunction(f, std::marker::PhantomData)) } } @@ -250,17 +263,21 @@ where type Output = PrimitiveArray; fn with_capacity(&self, capacity: usize) -> Self::DecodedState { - self.0.with_capacity(capacity) + ( + Vec::::with_capacity(capacity), + MutableBitmap::with_capacity(capacity), + ) } - fn deserialize_dict(&self, page: DictPage) -> Self::Dict { - self.0.deserialize_dict(page) + fn deserialize_dict(&self, page: DictPage) -> ParquetResult { + Ok(deserialize_plain::(&page.buffer, self.0.decoder)) } fn decode_plain_encoded<'a>( &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut as utils::StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()> 
{ @@ -272,6 +289,10 @@ where _pd: Default::default(), } .push_n(values, limit)?; + + if is_optional { + validity.extend_constant(limit, true); + } }, Some(page_validity) => { let collector = PlainDecoderFnCollector { @@ -297,11 +318,20 @@ where &mut self, (values, validity): &mut Self::DecodedState, page_values: &mut hybrid_rle::HybridRleDecoder<'a>, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, dict: &Self::Dict, limit: usize, ) -> ParquetResult<()> { match page_validity { + None => { + let translator = DictionaryTranslator(dict); + page_values.translate_and_collect_n_into(values, limit, &translator)?; + + if is_optional { + validity.extend_constant(limit, true); + } + }, Some(page_validity) => { let translator = DictionaryTranslator(dict); let translated_hybridrle = TranslatedHybridRle::new(page_values, &translator); @@ -314,10 +344,6 @@ where translated_hybridrle, )?; }, - None => { - let translator = DictionaryTranslator(dict); - page_values.translate_and_collect_n_into(values, limit, &translator)?; - }, } Ok(()) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs index 22da6ff14895..1a9d50a66d31 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs @@ -1,19 +1,178 @@ use arrow::types::NativeType; use num_traits::AsPrimitive; -use crate::parquet::types::NativeType as ParquetNativeType; +use crate::parquet::types::{decode, NativeType as ParquetNativeType}; -mod basic; +mod float; mod integer; -pub(crate) use basic::PrimitiveDecoder; +pub(crate) use float::FloatDecoder; pub(crate) use integer::IntDecoder; -use self::basic::DecoderFunction; +use super::utils::array_chunks::ArrayChunks; use super::utils::BatchableCollector; use super::ParquetResult; use crate::parquet::encoding::delta_bitpacked::{self, DeltaGatherer}; +#[derive(Debug)] +pub(crate) struct 
PrimitiveDecoder +where + P: ParquetNativeType, + T: NativeType, + D: DecoderFunction, +{ + pub(crate) decoder: D, + _pd: std::marker::PhantomData<(P, T)>, +} + +impl PrimitiveDecoder +where + P: ParquetNativeType, + T: NativeType, + D: DecoderFunction, +{ + #[inline] + pub(crate) fn new(decoder: D) -> Self { + Self { + decoder, + _pd: std::marker::PhantomData, + } + } +} + +/// A function that defines how to decode from the +/// [`parquet::types::NativeType`][ParquetNativeType] to the [`arrow::types::NativeType`]. +/// +/// This should almost always be inlined. +pub(crate) trait DecoderFunction: Copy +where + T: NativeType, + P: ParquetNativeType, +{ + fn decode(self, x: P) -> T; +} + +#[derive(Default, Clone, Copy)] +pub(crate) struct UnitDecoderFunction(std::marker::PhantomData); +impl DecoderFunction for UnitDecoderFunction { + #[inline(always)] + fn decode(self, x: T) -> T { + x + } +} + +#[derive(Default, Clone, Copy)] +pub(crate) struct AsDecoderFunction(std::marker::PhantomData<(P, T)>); +macro_rules! 
as_decoder_impl { + ($($p:ty => $t:ty,)+) => { + $( + impl DecoderFunction<$p, $t> for AsDecoderFunction<$p, $t> { + #[inline(always)] + fn decode(self, x : $p) -> $t { + x as $t + } + } + )+ + }; +} + +as_decoder_impl![ + i32 => i8, + i32 => i16, + i32 => u8, + i32 => u16, + i32 => u32, + i64 => i32, + i64 => u32, + i64 => u64, +]; + +#[derive(Default, Clone, Copy)] +pub(crate) struct IntoDecoderFunction(std::marker::PhantomData<(P, T)>); +impl DecoderFunction for IntoDecoderFunction +where + P: ParquetNativeType + Into, + T: NativeType, +{ + #[inline(always)] + fn decode(self, x: P) -> T { + x.into() + } +} + +#[derive(Clone, Copy)] +pub(crate) struct ClosureDecoderFunction(F, std::marker::PhantomData<(P, T)>); +impl DecoderFunction for ClosureDecoderFunction +where + P: ParquetNativeType, + T: NativeType, + F: Copy + Fn(P) -> T, +{ + #[inline(always)] + fn decode(self, x: P) -> T { + (self.0)(x) + } +} + +pub(crate) struct PlainDecoderFnCollector<'a, 'b, P, T, D> +where + T: NativeType, + P: ParquetNativeType, + D: DecoderFunction, +{ + pub(crate) chunks: &'b mut ArrayChunks<'a, P>, + pub(crate) decoder: D, + pub(crate) _pd: std::marker::PhantomData, +} + +impl<'a, 'b, P, T, D: DecoderFunction> BatchableCollector<(), Vec> + for PlainDecoderFnCollector<'a, 'b, P, T, D> +where + T: NativeType, + P: ParquetNativeType, + D: DecoderFunction, +{ + fn reserve(target: &mut Vec, n: usize) { + target.reserve(n); + } + + fn push_n(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { + let n = usize::min(self.chunks.len(), n); + let (items, remainder) = self.chunks.bytes.split_at(n); + let decoder = self.decoder; + target.extend( + items + .iter() + .map(|chunk| decoder.decode(P::from_le_bytes(*chunk))), + ); + self.chunks.bytes = remainder; + Ok(()) + } + + fn push_n_nulls(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { + target.resize(target.len() + n, T::default()); + Ok(()) + } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + 
self.chunks.skip_in_place(n); + Ok(()) + } +} + +fn deserialize_plain(values: &[u8], decoder: D) -> Vec +where + T: NativeType, + P: ParquetNativeType, + D: DecoderFunction, +{ + values + .chunks_exact(std::mem::size_of::

()) + .map(decode) + .map(|v| decoder.decode(v)) + .collect::>() +} + struct DeltaTranslator where T: NativeType, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs index ba4a15814006..9d512d834ebf 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs @@ -278,13 +278,13 @@ pub fn page_iter_to_array( (PhysicalType::Float, Float32) => Box::new(PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::::unit(), + primitive::FloatDecoder::::unit(), )? .collect_n(filter)?), (PhysicalType::Double, Float64) => Box::new(PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::::unit(), + primitive::FloatDecoder::::unit(), )? .collect_n(filter)?), // Don't compile this code with `i32` as we don't use this in polars @@ -393,7 +393,7 @@ fn timestamp( PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::closure(|x: [u32; 3]| int96_to_i64_ns(x)), + primitive::FloatDecoder::closure(|x: [u32; 3]| int96_to_i64_ns(x)), )? .collect_n(filter)?, )), @@ -401,7 +401,7 @@ fn timestamp( PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::closure(|x: [u32; 3]| int96_to_i64_us(x)), + primitive::FloatDecoder::closure(|x: [u32; 3]| int96_to_i64_us(x)), )? .collect_n(filter)?, )), @@ -409,7 +409,7 @@ fn timestamp( PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::closure(|x: [u32; 3]| int96_to_i64_ms(x)), + primitive::FloatDecoder::closure(|x: [u32; 3]| int96_to_i64_ms(x)), )? .collect_n(filter)?, )), @@ -417,7 +417,7 @@ fn timestamp( PageDecoder::new( pages, data_type, - primitive::PrimitiveDecoder::closure(|x: [u32; 3]| int96_to_i64_s(x)), + primitive::FloatDecoder::closure(|x: [u32; 3]| int96_to_i64_s(x)), )? 
.collect_n(filter)?, )), @@ -473,7 +473,7 @@ fn timestamp_dict( (a, true) => PageDecoder::new( pages, ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), - dictionary::DictionaryDecoder::::new(primitive::PrimitiveDecoder::closure( + dictionary::DictionaryDecoder::::new(primitive::FloatDecoder::closure( |x: [u32; 3]| int96_to_i64_ns(x) * a, )), )? @@ -481,7 +481,7 @@ fn timestamp_dict( (a, false) => PageDecoder::new( pages, ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), - dictionary::DictionaryDecoder::::new(primitive::PrimitiveDecoder::closure( + dictionary::DictionaryDecoder::::new(primitive::FloatDecoder::closure( |x: [u32; 3]| int96_to_i64_ns(x) / a, )), )? @@ -494,17 +494,13 @@ fn timestamp_dict( (a, true) => PageDecoder::new( pages, data_type, - dictionary::DictionaryDecoder::new(primitive::PrimitiveDecoder::closure(|x: i64| { - x * a - })), + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::closure(|x: i64| x * a)), )? .collect_n(filter), (a, false) => PageDecoder::new( pages, data_type, - dictionary::DictionaryDecoder::new(primitive::PrimitiveDecoder::closure(|x: i64| { - x / a - })), + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::closure(|x: i64| x / a)), )? .collect_n(filter), } @@ -524,132 +520,99 @@ fn dict_read( panic!() }; - Ok( - match (physical_type, values_data_type.to_logical_type()) { - (PhysicalType::Int32, UInt8) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - )? - .collect_n(filter)?, - (PhysicalType::Int32, UInt16) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - )? - .collect_n(filter)?, - (PhysicalType::Int32, UInt32) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - )? 
- .collect_n(filter)?, - (PhysicalType::Int64, UInt64) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - )? - .collect_n(filter)?, - (PhysicalType::Int32, Int8) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), - )? - .collect_n(filter)?, - (PhysicalType::Int32, Int16) => PageDecoder::new( + Ok(match (physical_type, values_data_type.to_logical_type()) { + (PhysicalType::Int32, UInt8) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int32, UInt16) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int32, UInt32) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int64, UInt64) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int32, Int8) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int32, Int16) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::cast_as()), + )? + .collect_n(filter)?, + (PhysicalType::Int32, Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth)) => { + PageDecoder::new( iter, data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::cast_as(), - ), + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), )? 
- .collect_n(filter)?, - ( - PhysicalType::Int32, - Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth), - ) => { - PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::unit(), - ), - )? - .collect_n(filter)? - }, - - (PhysicalType::Int64, Timestamp(time_unit, _)) => { - let time_unit = *time_unit; - return timestamp_dict::( - iter, - physical_type, - logical_type, - data_type, - filter, - time_unit, - ); - }, + .collect_n(filter)? + }, - (PhysicalType::Int64, Int64 | Date64 | Time64(_) | Duration(_)) => { - PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::unit(), - ), - )? - .collect_n(filter)? - }, - (PhysicalType::Float, Float32) => { - PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::unit(), - ), - )? - .collect_n(filter)? - }, - (PhysicalType::Double, Float64) => { - PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new( - primitive::PrimitiveDecoder::::unit(), - ), - )? - .collect_n(filter)? - }, - (_, LargeUtf8 | LargeBinary | Utf8 | Binary) => unreachable!(), - (PhysicalType::ByteArray, Utf8View | BinaryView) => PageDecoder::new( - iter, - data_type, - dictionary::DictionaryDecoder::new(BinViewDecoder::default()), - )? - .collect_n(filter)?, - (PhysicalType::FixedLenByteArray(size), FixedSizeBinary(_)) => PageDecoder::new( + (PhysicalType::Int64, Timestamp(time_unit, _)) => { + let time_unit = *time_unit; + return timestamp_dict::( iter, + physical_type, + logical_type, data_type, - dictionary::DictionaryDecoder::new(fixed_size_binary::BinaryDecoder { - size: *size, - }), - )? 
- .collect_n(filter)?, - other => { - return Err(ParquetError::FeatureNotSupported(format!( - "Reading dictionaries of type {other:?}" - ))); - }, + filter, + time_unit, + ); }, - ) + + (PhysicalType::Int64, Int64 | Date64 | Time64(_) | Duration(_)) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), + )? + .collect_n(filter)?, + (PhysicalType::Float, Float32) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), + )? + .collect_n(filter)?, + (PhysicalType::Double, Float64) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(primitive::FloatDecoder::::unit()), + )? + .collect_n(filter)?, + (_, LargeUtf8 | LargeBinary | Utf8 | Binary) => unreachable!(), + (PhysicalType::ByteArray, Utf8View | BinaryView) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(BinViewDecoder::default()), + )? + .collect_n(filter)?, + (PhysicalType::FixedLenByteArray(size), FixedSizeBinary(_)) => PageDecoder::new( + iter, + data_type, + dictionary::DictionaryDecoder::new(fixed_size_binary::BinaryDecoder { size: *size }), + )? 
+ .collect_n(filter)?, + other => { + return Err(ParquetError::FeatureNotSupported(format!( + "Reading dictionaries of type {other:?}" + ))); + }, + }) } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs index 7a18a0c16a85..c1dc1324bb27 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs @@ -19,6 +19,7 @@ use crate::parquet::schema::Repetition; #[derive(Debug)] pub(crate) struct State<'a, D: Decoder> { pub(crate) dict: Option<&'a D::Dict>, + pub(crate) is_optional: bool, pub(crate) page_validity: Option>, pub(crate) translation: D::Translation<'a>, } @@ -41,6 +42,7 @@ pub(crate) trait StateTranslation<'a, D: Decoder>: Sized { &mut self, decoder: &mut D, decoded: &mut D::DecodedState, + is_optional: bool, page_validity: &mut Option>, dict: Option<&'a D::Dict>, additional: usize, @@ -52,14 +54,25 @@ impl<'a, D: Decoder> State<'a, D> { let is_optional = page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; - let page_validity = is_optional + let mut page_validity = is_optional .then(|| page_validity_decoder(page)) .transpose()?; + // Make the page_validity None if there are no nulls in the page + let null_count = page + .null_count() + .map(Ok) + .or_else(|| page_validity.as_ref().map(hybrid_rle_count_zeros)) + .transpose()?; + if null_count == Some(0) { + page_validity = None; + } + let translation = D::Translation::new(decoder, page, dict, page_validity.as_ref())?; Ok(Self { dict, + is_optional, page_validity, translation, }) @@ -75,6 +88,9 @@ impl<'a, D: Decoder> State<'a, D> { Ok(Self { dict, translation, + + // Nested values may be optional, but all that is handled elsewhere. 
+ is_optional: false, page_validity: None, }) } @@ -120,6 +136,7 @@ impl<'a, D: Decoder> State<'a, D> { self.translation.extend_from_state( decoder, decoded, + self.is_optional, &mut self.page_validity, self.dict, num_rows, @@ -137,6 +154,7 @@ impl<'a, D: Decoder> State<'a, D> { self.translation.extend_from_state( decoder, decoded, + self.is_optional, &mut self.page_validity, self.dict, end - start, @@ -158,6 +176,7 @@ impl<'a, D: Decoder> State<'a, D> { self.translation.extend_from_state( decoder, decoded, + self.is_optional, &mut self.page_validity, self.dict, num_ones, @@ -586,7 +605,7 @@ pub(super) trait Decoder: Sized { fn with_capacity(&self, capacity: usize) -> Self::DecodedState; /// Deserializes a [`DictPage`] into [`Self::Dict`]. - fn deserialize_dict(&self, page: DictPage) -> Self::Dict; + fn deserialize_dict(&self, page: DictPage) -> ParquetResult; fn apply_dictionary( &mut self, @@ -600,6 +619,7 @@ pub(super) trait Decoder: Sized { &mut self, decoded: &mut Self::DecodedState, page_values: &mut as StateTranslation<'a, Self>>::PlainDecoder, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, limit: usize, ) -> ParquetResult<()>; @@ -607,6 +627,7 @@ pub(super) trait Decoder: Sized { &mut self, decoded: &mut Self::DecodedState, page_values: &mut HybridRleDecoder<'a>, + is_optional: bool, page_validity: Option<&mut PageValidity<'a>>, dict: &Self::Dict, limit: usize, @@ -675,7 +696,7 @@ impl PageDecoder { decoder: D, ) -> ParquetResult { let dict_page = iter.read_dict_page()?; - let dict = dict_page.map(|d| decoder.deserialize_dict(d)); + let dict = dict_page.map(|d| decoder.deserialize_dict(d)).transpose()?; Ok(Self { iter, diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs index 9c445d7a46ce..b3e96d9254f3 100644 --- a/crates/polars-parquet/src/arrow/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -48,7 +48,7 @@ pub fn get_field_pages<'a, T>( columns .iter() .zip(items) - 
.filter(|(metadata, _)| metadata.descriptor().path_in_schema[0] == field_name) + .filter(|(metadata, _)| metadata.descriptor().path_in_schema[0].as_str() == field_name) .map(|(_, item)| item) .collect() } diff --git a/crates/polars-parquet/src/arrow/read/schema/convert.rs b/crates/polars-parquet/src/arrow/read/schema/convert.rs index 2089e261188f..642cdbf506cd 100644 --- a/crates/polars-parquet/src/arrow/read/schema/convert.rs +++ b/crates/polars-parquet/src/arrow/read/schema/convert.rs @@ -1,5 +1,6 @@ //! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`]. use arrow::datatypes::{ArrowDataType, Field, IntervalUnit, TimeUnit}; +use polars_utils::pl_str::PlSmallStr; use crate::arrow::read::schema::SchemaInferenceOptions; use crate::parquet::schema::types::{ @@ -91,7 +92,7 @@ fn from_int64( let timezone = if is_adjusted_to_utc { // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md // A TIMESTAMP with isAdjustedToUTC=true is defined as [...] elapsed since the Unix epoch - Some("+00:00".to_string()) + Some(PlSmallStr::from_static("+00:00")) } else { // PARQUET: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md @@ -222,7 +223,7 @@ fn to_primitive_type( if primitive_type.field_info.repetition == Repetition::Repeated { ArrowDataType::LargeList(Box::new(Field::new( - &primitive_type.field_info.name, + primitive_type.field_info.name.clone(), base_type, is_nullable(&primitive_type.field_info), ))) @@ -285,7 +286,7 @@ fn to_group_type( debug_assert!(!fields.is_empty()); if field_info.repetition == Repetition::Repeated { Some(ArrowDataType::LargeList(Box::new(Field::new( - &field_info.name, + field_info.name.clone(), to_struct(fields, options)?, is_nullable(field_info), )))) @@ -308,7 +309,7 @@ pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { /// i.e. if it is a column-less group type. 
fn to_field(type_: &ParquetType, options: &SchemaInferenceOptions) -> Option { Some(Field::new( - &type_.get_field_info().name, + type_.get_field_info().name.clone(), to_data_type(type_, options)?, is_nullable(type_.get_field_info()), )) @@ -348,15 +349,15 @@ fn to_list( let (list_item_name, item_is_optional) = match item { ParquetType::GroupType { field_info, fields, .. - } if field_info.name == "list" && fields.len() == 1 => { + } if field_info.name.as_str() == "list" && fields.len() == 1 => { let field = fields.first().unwrap(); ( - &field.get_field_info().name, + field.get_field_info().name.clone(), field.get_field_info().repetition == Repetition::Optional, ) }, _ => ( - &item.get_field_info().name, + item.get_field_info().name.clone(), item.get_field_info().repetition == Repetition::Optional, ), }; @@ -397,7 +398,7 @@ pub(crate) fn to_data_type( logical_type, converted_type, fields, - &field_info.name, + field_info.name.as_str(), options, ) } @@ -430,17 +431,17 @@ mod tests { } "; let expected = &[ - Field::new("boolean", ArrowDataType::Boolean, false), - Field::new("int8", ArrowDataType::Int8, false), - Field::new("int16", ArrowDataType::Int16, false), - Field::new("uint8", ArrowDataType::UInt8, false), - Field::new("uint16", ArrowDataType::UInt16, false), - Field::new("int32", ArrowDataType::Int32, false), - Field::new("int64", ArrowDataType::Int64, false), - Field::new("double", ArrowDataType::Float64, true), - Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::Utf8View, true), - Field::new("string_2", ArrowDataType::Utf8View, true), + Field::new("boolean".into(), ArrowDataType::Boolean, false), + Field::new("int8".into(), ArrowDataType::Int8, false), + Field::new("int16".into(), ArrowDataType::Int16, false), + Field::new("uint8".into(), ArrowDataType::UInt8, false), + Field::new("uint16".into(), ArrowDataType::UInt16, false), + Field::new("int32".into(), ArrowDataType::Int32, false), + Field::new("int64".into(), 
ArrowDataType::Int64, false), + Field::new("double".into(), ArrowDataType::Float64, true), + Field::new("float".into(), ArrowDataType::Float32, true), + Field::new("string".into(), ArrowDataType::Utf8View, true), + Field::new("string_2".into(), ArrowDataType::Utf8View, true), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; @@ -459,8 +460,12 @@ mod tests { } "; let expected = vec![ - Field::new("binary", ArrowDataType::BinaryView, false), - Field::new("fixed_binary", ArrowDataType::FixedSizeBinary(20), false), + Field::new("binary".into(), ArrowDataType::BinaryView, false), + Field::new( + "fixed_binary".into(), + ArrowDataType::FixedSizeBinary(20), + false, + ), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; @@ -479,8 +484,8 @@ mod tests { } "; let expected = &[ - Field::new("boolean", ArrowDataType::Boolean, false), - Field::new("int8", ArrowDataType::Int8, false), + Field::new("boolean".into(), ArrowDataType::Boolean, false), + Field::new("int8".into(), ArrowDataType::Int8, false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; @@ -554,9 +559,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list", + "my_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8, true, ))), @@ -572,9 +577,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list", + "my_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8, false, ))), @@ -596,13 +601,17 @@ mod tests { // } { let arrow_inner_list = ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Int32, false, ))); arrow_fields.push(Field::new( - "array_of_arrays", - ArrowDataType::LargeList(Box::new(Field::new("element", arrow_inner_list, false))), + "array_of_arrays".into(), + ArrowDataType::LargeList(Box::new(Field::new( + PlSmallStr::from_static("element"), + arrow_inner_list, + false, + 
))), true, )); } @@ -615,9 +624,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list", + "my_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8, false, ))), @@ -631,9 +640,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list", + "my_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Int32, false, ))), @@ -650,12 +659,16 @@ mod tests { // } { let arrow_struct = ArrowDataType::Struct(vec![ - Field::new("str", ArrowDataType::Utf8, false), - Field::new("num", ArrowDataType::Int32, false), + Field::new("str".into(), ArrowDataType::Utf8, false), + Field::new("num".into(), ArrowDataType::Int32, false), ]); arrow_fields.push(Field::new( - "my_list", - ArrowDataType::LargeList(Box::new(Field::new("element", arrow_struct, false))), + "my_list".into(), + ArrowDataType::LargeList(Box::new(Field::new( + "element".into(), + arrow_struct, + false, + ))), true, )); } @@ -669,10 +682,10 @@ mod tests { // Special case: group is named array { let arrow_struct = - ArrowDataType::Struct(vec![Field::new("str", ArrowDataType::Utf8, false)]); + ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]); arrow_fields.push(Field::new( - "my_list", - ArrowDataType::LargeList(Box::new(Field::new("array", arrow_struct, false))), + "my_list".into(), + ArrowDataType::LargeList(Box::new(Field::new("array".into(), arrow_struct, false))), true, )); } @@ -686,11 +699,11 @@ mod tests { // Special case: group named ends in _tuple { let arrow_struct = - ArrowDataType::Struct(vec![Field::new("str", ArrowDataType::Utf8, false)]); + ArrowDataType::Struct(vec![Field::new("str".into(), ArrowDataType::Utf8, false)]); arrow_fields.push(Field::new( - "my_list", + "my_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "my_list_tuple", + "my_list_tuple".into(), arrow_struct, false, ))), @@ -702,8 +715,12 @@ mod tests { // repeated 
value_type name { arrow_fields.push(Field::new( - "name", - ArrowDataType::LargeList(Box::new(Field::new("name", ArrowDataType::Int32, false))), + "name".into(), + ArrowDataType::LargeList(Box::new(Field::new( + "name".into(), + ArrowDataType::Int32, + false, + ))), false, )); } @@ -732,17 +749,17 @@ mod tests { { let struct_fields = vec![ - Field::new("event_name", ArrowDataType::Utf8View, false), + Field::new("event_name".into(), ArrowDataType::Utf8View, false), Field::new( - "event_time", + "event_time".into(), ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), false, ), ]; arrow_fields.push(Field::new( - "events", + "events".into(), ArrowDataType::LargeList(Box::new(Field::new( - "array", + "array".into(), ArrowDataType::Struct(struct_fields), false, ))), @@ -789,9 +806,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list1", + "my_list1".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8View, true, ))), @@ -807,9 +824,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list2", + "my_list2".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8View, false, ))), @@ -825,9 +842,9 @@ mod tests { // } { arrow_fields.push(Field::new( - "my_list3", + "my_list3".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Utf8View, false, ))), @@ -847,13 +864,14 @@ mod tests { let mut arrow_fields = Vec::new(); { let group1_fields = vec![ - Field::new("leaf1", ArrowDataType::Boolean, false), - Field::new("leaf2", ArrowDataType::Int32, false), + Field::new("leaf1".into(), ArrowDataType::Boolean, false), + Field::new("leaf2".into(), ArrowDataType::Int32, false), ]; - let group1_struct = Field::new("group1", ArrowDataType::Struct(group1_fields), false); + let group1_struct = + Field::new("group1".into(), ArrowDataType::Struct(group1_fields), false); arrow_fields.push(group1_struct); - 
let leaf3_field = Field::new("leaf3", ArrowDataType::Int64, false); + let leaf3_field = Field::new("leaf3".into(), ArrowDataType::Int64, false); arrow_fields.push(leaf3_field); } @@ -879,24 +897,28 @@ mod tests { fn test_repeated_nested_schema() -> PolarsResult<()> { let mut arrow_fields = Vec::new(); { - arrow_fields.push(Field::new("leaf1", ArrowDataType::Int32, true)); + arrow_fields.push(Field::new("leaf1".into(), ArrowDataType::Int32, true)); let inner_group_list = Field::new( - "innerGroup", + "innerGroup".into(), ArrowDataType::LargeList(Box::new(Field::new( - "innerGroup", - ArrowDataType::Struct(vec![Field::new("leaf3", ArrowDataType::Int32, true)]), + "innerGroup".into(), + ArrowDataType::Struct(vec![Field::new( + "leaf3".into(), + ArrowDataType::Int32, + true, + )]), false, ))), false, ); let outer_group_list = Field::new( - "outerGroup", + "outerGroup".into(), ArrowDataType::LargeList(Box::new(Field::new( - "outerGroup", + "outerGroup".into(), ArrowDataType::Struct(vec![ - Field::new("leaf2", ArrowDataType::Int32, true), + Field::new("leaf2".into(), ArrowDataType::Int32, true), inner_group_list, ]), false, @@ -951,54 +973,54 @@ mod tests { } "; let arrow_fields = vec![ - Field::new("boolean", ArrowDataType::Boolean, false), - Field::new("int8", ArrowDataType::Int8, false), - Field::new("uint8", ArrowDataType::UInt8, false), - Field::new("int16", ArrowDataType::Int16, false), - Field::new("uint16", ArrowDataType::UInt16, false), - Field::new("int32", ArrowDataType::Int32, false), - Field::new("int64", ArrowDataType::Int64, false), - Field::new("double", ArrowDataType::Float64, true), - Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::Utf8, true), + Field::new("boolean".into(), ArrowDataType::Boolean, false), + Field::new("int8".into(), ArrowDataType::Int8, false), + Field::new("uint8".into(), ArrowDataType::UInt8, false), + Field::new("int16".into(), ArrowDataType::Int16, false), + Field::new("uint16".into(), 
ArrowDataType::UInt16, false), + Field::new("int32".into(), ArrowDataType::Int32, false), + Field::new("int64".into(), ArrowDataType::Int64, false), + Field::new("double".into(), ArrowDataType::Float64, true), + Field::new("float".into(), ArrowDataType::Float32, true), + Field::new("string".into(), ArrowDataType::Utf8, true), Field::new( - "bools", + "bools".into(), ArrowDataType::LargeList(Box::new(Field::new( - "bools", + "bools".into(), ArrowDataType::Boolean, false, ))), false, ), - Field::new("date", ArrowDataType::Date32, true), + Field::new("date".into(), ArrowDataType::Date32, true), Field::new( - "time_milli", + "time_milli".into(), ArrowDataType::Time32(TimeUnit::Millisecond), true, ), Field::new( - "time_micro", + "time_micro".into(), ArrowDataType::Time64(TimeUnit::Microsecond), true, ), Field::new( - "time_nano", + "time_nano".into(), ArrowDataType::Time64(TimeUnit::Nanosecond), true, ), Field::new( - "ts_milli", + "ts_milli".into(), ArrowDataType::Timestamp(TimeUnit::Millisecond, None), true, ), Field::new( - "ts_micro", + "ts_micro".into(), ArrowDataType::Timestamp(TimeUnit::Microsecond, None), false, ), Field::new( - "ts_nano", - ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), + "ts_nano".into(), + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), false, ), ]; @@ -1051,62 +1073,62 @@ mod tests { "; let arrow_fields = vec![ - Field::new("boolean", ArrowDataType::Boolean, false), - Field::new("int8", ArrowDataType::Int8, false), - Field::new("int16", ArrowDataType::Int16, false), - Field::new("int32", ArrowDataType::Int32, false), - Field::new("int64", ArrowDataType::Int64, false), - Field::new("double", ArrowDataType::Float64, true), - Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::Utf8View, true), + Field::new("boolean".into(), ArrowDataType::Boolean, false), + Field::new("int8".into(), ArrowDataType::Int8, false), + Field::new("int16".into(), 
ArrowDataType::Int16, false), + Field::new("int32".into(), ArrowDataType::Int32, false), + Field::new("int64".into(), ArrowDataType::Int64, false), + Field::new("double".into(), ArrowDataType::Float64, true), + Field::new("float".into(), ArrowDataType::Float32, true), + Field::new("string".into(), ArrowDataType::Utf8View, true), Field::new( - "bools", + "bools".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Boolean, true, ))), true, ), Field::new( - "bools_non_null", + "bools_non_null".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Boolean, false, ))), false, ), - Field::new("date", ArrowDataType::Date32, true), + Field::new("date".into(), ArrowDataType::Date32, true), Field::new( - "time_milli", + "time_milli".into(), ArrowDataType::Time32(TimeUnit::Millisecond), true, ), Field::new( - "time_micro", + "time_micro".into(), ArrowDataType::Time64(TimeUnit::Microsecond), true, ), Field::new( - "ts_milli", + "ts_milli".into(), ArrowDataType::Timestamp(TimeUnit::Millisecond, None), true, ), Field::new( - "ts_micro", + "ts_micro".into(), ArrowDataType::Timestamp(TimeUnit::Microsecond, None), false, ), Field::new( - "struct", + "struct".into(), ArrowDataType::Struct(vec![ - Field::new("bools", ArrowDataType::Boolean, false), - Field::new("uint32", ArrowDataType::UInt32, false), + Field::new("bools".into(), ArrowDataType::Boolean, false), + Field::new("uint32".into(), ArrowDataType::UInt32, false), Field::new( - "int32", + "int32".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), ArrowDataType::Int32, true, ))), @@ -1115,7 +1137,7 @@ mod tests { ]), false, ), - Field::new("dictionary_strings", ArrowDataType::Utf8View, false), + Field::new("dictionary_strings".into(), ArrowDataType::Utf8View, false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; @@ -1148,20 +1170,20 @@ mod tests { "; let coerced_to = 
ArrowDataType::Timestamp(tu, None); let arrow_fields = vec![ - Field::new("int96_field", coerced_to.clone(), false), + Field::new("int96_field".into(), coerced_to.clone(), false), Field::new( - "int96_list", + "int96_list".into(), ArrowDataType::LargeList(Box::new(Field::new( - "element", + "element".into(), coerced_to.clone(), true, ))), true, ), Field::new( - "int96_struct", + "int96_struct".into(), ArrowDataType::Struct(vec![Field::new( - "int96_field", + "int96_field".into(), coerced_to.clone(), false, )]), diff --git a/crates/polars-parquet/src/arrow/read/schema/metadata.rs b/crates/polars-parquet/src/arrow/read/schema/metadata.rs index d18de8ee4a58..d90d270b9666 100644 --- a/crates/polars-parquet/src/arrow/read/schema/metadata.rs +++ b/crates/polars-parquet/src/arrow/read/schema/metadata.rs @@ -3,6 +3,7 @@ use arrow::io::ipc::read::deserialize_schema; use base64::engine::general_purpose; use base64::Engine as _; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::super::super::ARROW_SCHEMA_META_KEY; pub use crate::parquet::metadata::KeyValue; @@ -86,9 +87,12 @@ pub(super) fn parse_key_value_metadata(key_value_metadata: &Option key_values .iter() .filter_map(|kv| { - kv.value - .as_ref() - .map(|value| (kv.key.clone(), value.clone())) + kv.value.as_ref().map(|value| { + ( + PlSmallStr::from_str(kv.key.as_str()), + PlSmallStr::from_str(value.as_str()), + ) + }) }) .collect() }) diff --git a/crates/polars-parquet/src/arrow/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs index 0face3c8b358..56052018fa12 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -198,22 +198,30 @@ fn create_dt(data_type: &ArrowDataType) -> ArrowDataType { ArrowDataType::Struct(fields) => ArrowDataType::Struct( fields .iter() - .map(|f| Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)) + .map(|f| Field::new(f.name.clone(), 
create_dt(&f.data_type), f.is_nullable)) .collect(), ), ArrowDataType::Map(f, ordered) => ArrowDataType::Map( - Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)), + Box::new(Field::new( + f.name.clone(), + create_dt(&f.data_type), + f.is_nullable, + )), *ordered, ), ArrowDataType::LargeList(f) => ArrowDataType::LargeList(Box::new(Field::new( - &f.name, + f.name.clone(), create_dt(&f.data_type), f.is_nullable, ))), // FixedSizeList piggy backs on list - ArrowDataType::List(f) | ArrowDataType::FixedSizeList(f, _) => ArrowDataType::List( - Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)), - ), + ArrowDataType::List(f) | ArrowDataType::FixedSizeList(f, _) => { + ArrowDataType::List(Box::new(Field::new( + f.name.clone(), + create_dt(&f.data_type), + f.is_nullable, + ))) + }, _ => ArrowDataType::UInt64, } } diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index abdaab87bb3f..c76471f783bd 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -32,6 +32,7 @@ use arrow::datatypes::*; use arrow::types::{days_ms, i256, NativeType}; pub use nested::{num_values, write_rep_and_def}; pub use pages::{to_leaves, to_nested, to_parquet_leaves}; +use polars_utils::pl_str::PlSmallStr; pub use utils::write_def_levels; pub use crate::parquet::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; @@ -195,7 +196,10 @@ pub fn to_parquet_schema(schema: &ArrowSchema) -> PolarsResult .iter() .map(to_parquet_type) .collect::>>()?; - Ok(SchemaDescriptor::new("root".to_string(), parquet_types)) + Ok(SchemaDescriptor::new( + PlSmallStr::from_static("root"), + parquet_types, + )) } /// Slices the [`Array`] to `Box` and `Vec`. 
diff --git a/crates/polars-parquet/src/arrow/write/pages.rs b/crates/polars-parquet/src/arrow/write/pages.rs index 6e663a4d389e..6196ee998a02 100644 --- a/crates/polars-parquet/src/arrow/write/pages.rs +++ b/crates/polars-parquet/src/arrow/write/pages.rs @@ -149,7 +149,7 @@ fn to_nested_recursive( fields } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a struct array".to_string(), + "Parquet type must be a group for a struct array", ) }; @@ -170,12 +170,12 @@ fn to_nested_recursive( &fields[0] } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) } } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) }; @@ -194,12 +194,12 @@ fn to_nested_recursive( &fields[0] } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) } } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) }; @@ -217,12 +217,12 @@ fn to_nested_recursive( &fields[0] } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) } } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a list array".to_string(), + "Parquet type must be a group for a list array", ) }; @@ -240,12 +240,12 @@ fn to_nested_recursive( &fields[0] } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a map array".to_string(), + "Parquet type must be a group for a map array", ) } } else { polars_bail!(InvalidOperation: - "Parquet type must be a group for a map array".to_string(), + "Parquet type must be a group for a map array", ) }; @@ -594,8 +594,8 @@ mod tests { let 
int = Int32Array::from_slice([42, 28, 19, 31]).boxed(); let fields = vec![ - Field::new("b", ArrowDataType::Boolean, false), - Field::new("c", ArrowDataType::Int32, false), + Field::new("b".into(), ArrowDataType::Boolean, false), + Field::new("c".into(), ArrowDataType::Int32, false), ]; let array = StructArray::new( @@ -606,7 +606,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "a".to_string(), + name: "a".into(), repetition: Repetition::Optional, id: None, }, @@ -615,7 +615,7 @@ mod tests { fields: vec![ ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "b".to_string(), + name: "b".into(), repetition: Repetition::Required, id: None, }, @@ -625,7 +625,7 @@ mod tests { }), ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "c".to_string(), + name: "c".into(), repetition: Repetition::Required, id: None, }, @@ -658,8 +658,8 @@ mod tests { let int = Int32Array::from_slice([42, 28, 19, 31]).boxed(); let fields = vec![ - Field::new("b", ArrowDataType::Boolean, false), - Field::new("c", ArrowDataType::Int32, false), + Field::new("b".into(), ArrowDataType::Boolean, false), + Field::new("c".into(), ArrowDataType::Int32, false), ]; let array = StructArray::new( @@ -669,8 +669,8 @@ mod tests { ); let fields = vec![ - Field::new("b", array.data_type().clone(), true), - Field::new("c", array.data_type().clone(), true), + Field::new("b".into(), array.data_type().clone(), true), + Field::new("c".into(), array.data_type().clone(), true), ]; let array = StructArray::new( @@ -681,7 +681,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "a".to_string(), + name: "a".into(), repetition: Repetition::Optional, id: None, }, @@ -690,7 +690,7 @@ mod tests { fields: vec![ ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "b".to_string(), + name: "b".into(), repetition: Repetition::Required, id: None, }, @@ -700,7 +700,7 @@ 
mod tests { }), ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "c".to_string(), + name: "c".into(), repetition: Repetition::Required, id: None, }, @@ -713,7 +713,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "a".to_string(), + name: "a".into(), repetition: Repetition::Required, id: None, }, @@ -761,8 +761,8 @@ mod tests { let int = Int32Array::from_slice([42, 28, 19, 31]).boxed(); let fields = vec![ - Field::new("b", ArrowDataType::Boolean, false), - Field::new("c", ArrowDataType::Int32, false), + Field::new("b".into(), ArrowDataType::Boolean, false), + Field::new("c".into(), ArrowDataType::Int32, false), ]; let array = StructArray::new( @@ -772,7 +772,11 @@ mod tests { ); let array = ListArray::new( - ArrowDataType::List(Box::new(Field::new("l", array.data_type().clone(), true))), + ArrowDataType::List(Box::new(Field::new( + "l".into(), + array.data_type().clone(), + true, + ))), vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, @@ -780,7 +784,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "a".to_string(), + name: "a".into(), repetition: Repetition::Optional, id: None, }, @@ -789,7 +793,7 @@ mod tests { fields: vec![ ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "b".to_string(), + name: "b".into(), repetition: Repetition::Required, id: None, }, @@ -799,7 +803,7 @@ mod tests { }), ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "c".to_string(), + name: "c".into(), repetition: Repetition::Required, id: None, }, @@ -812,7 +816,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "l".to_string(), + name: "l".into(), repetition: Repetition::Required, id: None, }, @@ -820,7 +824,7 @@ mod tests { converted_type: None, fields: vec![ParquetType::GroupType { field_info: FieldInfo { - name: "list".to_string(), + name: "list".into(), repetition: 
Repetition::Repeated, id: None, }, @@ -860,10 +864,10 @@ mod tests { #[test] fn test_map() { let kv_type = ArrowDataType::Struct(vec![ - Field::new("k", ArrowDataType::Utf8, false), - Field::new("v", ArrowDataType::Int32, false), + Field::new("k".into(), ArrowDataType::Utf8, false), + Field::new("v".into(), ArrowDataType::Int32, false), ]); - let kv_field = Field::new("kv", kv_type.clone(), false); + let kv_field = Field::new("kv".into(), kv_type.clone(), false); let map_type = ArrowDataType::Map(Box::new(kv_field), false); let key_array = Utf8Array::::from_slice(["k1", "k2", "k3", "k4", "k5", "k6"]).boxed(); @@ -877,7 +881,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "kv".to_string(), + name: "kv".into(), repetition: Repetition::Optional, id: None, }, @@ -886,7 +890,7 @@ mod tests { fields: vec![ ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "k".to_string(), + name: "k".into(), repetition: Repetition::Required, id: None, }, @@ -896,7 +900,7 @@ mod tests { }), ParquetType::PrimitiveType(ParquetPrimitiveType { field_info: FieldInfo { - name: "v".to_string(), + name: "v".into(), repetition: Repetition::Required, id: None, }, @@ -909,7 +913,7 @@ mod tests { let type_ = ParquetType::GroupType { field_info: FieldInfo { - name: "m".to_string(), + name: "m".into(), repetition: Repetition::Required, id: None, }, @@ -917,7 +921,7 @@ mod tests { converted_type: None, fields: vec![ParquetType::GroupType { field_info: FieldInfo { - name: "map".to_string(), + name: "map".into(), repetition: Repetition::Repeated, id: None, }, diff --git a/crates/polars-parquet/src/arrow/write/schema.rs b/crates/polars-parquet/src/arrow/write/schema.rs index 047291770180..171b79cf3d24 100644 --- a/crates/polars-parquet/src/arrow/write/schema.rs +++ b/crates/polars-parquet/src/arrow/write/schema.rs @@ -3,6 +3,7 @@ use arrow::io::ipc::write::{default_ipc_fields, schema_to_bytes}; use base64::engine::general_purpose; use 
base64::Engine as _; use polars_error::{polars_bail, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use super::super::ARROW_SCHEMA_META_KEY; use crate::arrow::write::decimal_length_from_precision; @@ -303,7 +304,7 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult { )) }, ArrowDataType::Dictionary(_, value, _) => { - let dict_field = Field::new(name.as_str(), value.as_ref().clone(), field.is_nullable); + let dict_field = Field::new(name.clone(), value.as_ref().clone(), field.is_nullable); to_parquet_type(&dict_field) }, ArrowDataType::FixedSizeBinary(size) => Ok(ParquetType::try_from_primitive( @@ -392,7 +393,7 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult { | ArrowDataType::FixedSizeList(f, _) | ArrowDataType::LargeList(f) => { let mut f = f.clone(); - f.name = "element".to_string(); + f.name = PlSmallStr::from_static("element"); Ok(ParquetType::from_group( name, @@ -400,7 +401,7 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult { Some(GroupConvertedType::List), Some(GroupLogicalType::List), vec![ParquetType::from_group( - "list".to_string(), + PlSmallStr::from_static("list"), Repetition::Repeated, None, None, @@ -416,7 +417,7 @@ pub fn to_parquet_type(field: &Field) -> PolarsResult { Some(GroupConvertedType::Map), Some(GroupLogicalType::Map), vec![ParquetType::from_group( - "map".to_string(), + PlSmallStr::from_static("map"), Repetition::Repeated, None, None, diff --git a/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs b/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs index 2c9a0d1f6e48..035ba32ad002 100644 --- a/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs +++ b/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs @@ -1,3 +1,4 @@ +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde_types")] use serde::{Deserialize, Serialize}; @@ -28,7 +29,7 @@ pub struct ColumnDescriptor { pub descriptor: Descriptor, /// The path of this column. For instance, "a.b.c.d". 
- pub path_in_schema: Vec, + pub path_in_schema: Vec, /// The [`ParquetType`] this descriptor is a leaf of pub base_type: ParquetType, @@ -38,7 +39,7 @@ impl ColumnDescriptor { /// Creates new descriptor for leaf-level column. pub fn new( descriptor: Descriptor, - path_in_schema: Vec, + path_in_schema: Vec, base_type: ParquetType, ) -> Self { Self { diff --git a/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs b/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs index 734ee054aebe..7c29f983ee1d 100644 --- a/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs +++ b/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs @@ -1,4 +1,5 @@ use parquet_format_safe::SchemaElement; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde_types")] use serde::{Deserialize, Serialize}; @@ -13,7 +14,7 @@ use crate::parquet::schema::Repetition; #[derive(Debug, Clone)] #[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] pub struct SchemaDescriptor { - name: String, + name: PlSmallStr, // The top-level schema (the "message" type). fields: Vec, @@ -24,7 +25,7 @@ pub struct SchemaDescriptor { impl SchemaDescriptor { /// Creates new schema descriptor from Parquet schema. 
- pub fn new(name: String, fields: Vec) -> Self { + pub fn new(name: PlSmallStr, fields: Vec) -> Self { let mut leaves = vec![]; for f in &fields { let mut path = vec![]; @@ -113,7 +114,7 @@ fn build_tree<'a>( match tp { ParquetType::PrimitiveType(p) => { - let path_in_schema = path_so_far.iter().copied().map(String::from).collect(); + let path_in_schema = path_so_far.iter().copied().map(Into::into).collect(); leaves.push(ColumnDescriptor::new( Descriptor { primitive_type: p.clone(), diff --git a/crates/polars-parquet/src/parquet/page/mod.rs b/crates/polars-parquet/src/parquet/page/mod.rs index 128f1af03c14..400bdfc4a0f7 100644 --- a/crates/polars-parquet/src/parquet/page/mod.rs +++ b/crates/polars-parquet/src/parquet/page/mod.rs @@ -123,6 +123,13 @@ impl DataPageHeader { DataPageHeader::V2(d) => d.num_values as usize, } } + + pub fn null_count(&self) -> Option { + match &self { + DataPageHeader::V1(_) => None, + DataPageHeader::V2(d) => Some(d.num_nulls as usize), + } + } } /// A [`DataPage`] is an uncompressed, encoded representation of a Parquet data page. It holds actual data @@ -181,6 +188,10 @@ impl DataPage { self.header.num_values() } + pub fn null_count(&self) -> Option { + self.header.null_count() + } + pub fn num_rows(&self) -> Option { self.num_rows } @@ -258,13 +269,6 @@ pub enum CompressedPage { } impl CompressedPage { - pub(crate) fn buffer(&self) -> &[u8] { - match self { - CompressedPage::Data(page) => &page.buffer, - CompressedPage::Dict(page) => &page.buffer, - } - } - pub(crate) fn buffer_mut(&mut self) -> &mut Vec { match self { CompressedPage::Data(page) => page.buffer.to_mut(), @@ -292,13 +296,6 @@ impl CompressedPage { CompressedPage::Dict(_) => Some(0), } } - - pub(crate) fn uncompressed_size(&self) -> usize { - match self { - CompressedPage::Data(page) => page.uncompressed_page_size, - CompressedPage::Dict(page) => page.uncompressed_page_size, - } - } } /// An uncompressed, encoded dictionary page. 
diff --git a/crates/polars-parquet/src/parquet/read/compression.rs b/crates/polars-parquet/src/parquet/read/compression.rs index a3d2db312ada..a79989c39e26 100644 --- a/crates/polars-parquet/src/parquet/read/compression.rs +++ b/crates/polars-parquet/src/parquet/read/compression.rs @@ -54,75 +54,72 @@ fn decompress_v2( Ok(()) } -/// decompresses a [`CompressedDataPage`] into `buffer`. -/// If the page is un-compressed, `buffer` is swapped instead. -/// Returns whether the page was decompressed. -pub fn decompress_buffer( - compressed_page: &mut CompressedPage, - buffer: &mut Vec, -) -> ParquetResult { - if compressed_page.compression() != Compression::Uncompressed { - // prepare the compression buffer - let read_size = compressed_page.uncompressed_size(); - - if read_size > buffer.capacity() { - // dealloc and ignore region, replacing it by a new region. - // This won't reallocate - it frees and calls `alloc_zeroed` - *buffer = vec![0; read_size]; - } else if read_size > buffer.len() { - // fill what we need with zeros so that we can use them in `Read`. - // This won't reallocate - buffer.resize(read_size, 0); - } else { - buffer.truncate(read_size); - } - match compressed_page { - CompressedPage::Data(compressed_page) => match compressed_page.header() { - DataPageHeader::V1(_) => { - decompress_v1(&compressed_page.buffer, compressed_page.compression, buffer)? 
- }, - DataPageHeader::V2(header) => decompress_v2( - &compressed_page.buffer, - header, - compressed_page.compression, - buffer, - )?, - }, - CompressedPage::Dict(page) => decompress_v1(&page.buffer, page.compression(), buffer)?, - } - Ok(true) - } else { - // page.buffer is already decompressed => swap it with `buffer`, making `page.buffer` the - // decompression buffer and `buffer` the decompressed buffer - std::mem::swap(&mut compressed_page.buffer().to_vec(), buffer); - Ok(false) - } -} - -fn create_page(compressed_page: CompressedPage, buffer: Vec) -> Page { - match compressed_page { - CompressedPage::Data(page) => Page::Data(DataPage::new_read( +/// Decompresses the page, using `buffer` for decompression. +/// If `page.buffer.len() == 0`, there was no decompression and the buffer was moved. +/// Else, decompression took place. +pub fn decompress(compressed_page: CompressedPage, buffer: &mut Vec) -> ParquetResult { + Ok(match (compressed_page.compression(), compressed_page) { + (Compression::Uncompressed, CompressedPage::Data(page)) => Page::Data(DataPage::new_read( page.header, - CowBuffer::Owned(buffer), + page.buffer, page.descriptor, )), - CompressedPage::Dict(page) => Page::Dict(DictPage { - buffer: CowBuffer::Owned(buffer), + (_, CompressedPage::Data(page)) => { + // prepare the compression buffer + let read_size = page.uncompressed_size(); + + if read_size > buffer.capacity() { + // dealloc and ignore region, replacing it by a new region. + // This won't reallocate - it frees and calls `alloc_zeroed` + *buffer = vec![0; read_size]; + } else if read_size > buffer.len() { + // fill what we need with zeros so that we can use them in `Read`. + // This won't reallocate + buffer.resize(read_size, 0); + } else { + buffer.truncate(read_size); + } + + match page.header() { + DataPageHeader::V1(_) => decompress_v1(&page.buffer, page.compression, buffer)?, + DataPageHeader::V2(header) => { + decompress_v2(&page.buffer, header, page.compression, buffer)? 
+ }, + } + let buffer = CowBuffer::Owned(std::mem::take(buffer)); + + Page::Data(DataPage::new_read(page.header, buffer, page.descriptor)) + }, + (Compression::Uncompressed, CompressedPage::Dict(page)) => Page::Dict(DictPage { + buffer: page.buffer, num_values: page.num_values, is_sorted: page.is_sorted, }), - } -} - -/// Decompresses the page, using `buffer` for decompression. -/// If `page.buffer.len() == 0`, there was no decompression and the buffer was moved. -/// Else, decompression took place. -pub fn decompress( - mut compressed_page: CompressedPage, - buffer: &mut Vec, -) -> ParquetResult { - decompress_buffer(&mut compressed_page, buffer)?; - Ok(create_page(compressed_page, std::mem::take(buffer))) + (_, CompressedPage::Dict(page)) => { + // prepare the compression buffer + let read_size = page.uncompressed_page_size; + + if read_size > buffer.capacity() { + // dealloc and ignore region, replacing it by a new region. + // This won't reallocate - it frees and calls `alloc_zeroed` + *buffer = vec![0; read_size]; + } else if read_size > buffer.len() { + // fill what we need with zeros so that we can use them in `Read`. 
+ // This won't reallocate + buffer.resize(read_size, 0); + } else { + buffer.truncate(read_size); + } + decompress_v1(&page.buffer, page.compression(), buffer)?; + let buffer = CowBuffer::Owned(std::mem::take(buffer)); + + Page::Dict(DictPage { + buffer, + num_values: page.num_values, + is_sorted: page.is_sorted, + }) + }, + }) } type _Decompressor = streaming_decompression::Decompressor< diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs index ffd1534f928c..a66ac4817a8c 100644 --- a/crates/polars-parquet/src/parquet/read/mod.rs +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -63,5 +63,5 @@ pub fn get_field_columns<'a>( ) -> impl Iterator { columns .iter() - .filter(move |x| x.descriptor().path_in_schema[0] == field_name) + .filter(move |x| x.descriptor().path_in_schema[0].as_str() == field_name) } diff --git a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs index 3098241d8425..d4f2c692e95d 100644 --- a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs +++ b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs @@ -43,6 +43,7 @@ //! 
``` use parquet_format_safe::Type; +use polars_utils::pl_str::PlSmallStr; use types::PrimitiveLogicalType; use super::super::types::{ParquetType, TimeUnit}; @@ -313,7 +314,7 @@ impl<'a> Parser<'a> { .next() .ok_or_else(|| ParquetError::oos("Expected name, found None"))?; let fields = self.parse_child_types()?; - Ok(ParquetType::new_root(name.to_string(), fields)) + Ok(ParquetType::new_root(PlSmallStr::from_str(name), fields)) }, _ => Err(ParquetError::oos( "Message type does not start with 'message'", @@ -389,7 +390,7 @@ impl<'a> Parser<'a> { let fields = self.parse_child_types()?; Ok(ParquetType::from_converted( - name.to_string(), + PlSmallStr::from_str(name), fields, repetition, converted_type, @@ -473,7 +474,7 @@ impl<'a> Parser<'a> { assert_token(self.tokenizer.next(), ";")?; ParquetType::try_from_primitive( - name.to_string(), + PlSmallStr::from_str(name), (physical_type, length).try_into()?, repetition, converted_type, @@ -883,7 +884,7 @@ mod tests { let fields = vec![ ParquetType::try_from_primitive( - "f1".to_string(), + PlSmallStr::from_static("f1"), PhysicalType::FixedLenByteArray(5), Repetition::Optional, None, @@ -891,7 +892,7 @@ mod tests { None, )?, ParquetType::try_from_primitive( - "f2".to_string(), + PlSmallStr::from_static("f2"), PhysicalType::FixedLenByteArray(16), Repetition::Optional, None, @@ -900,7 +901,7 @@ mod tests { )?, ]; - let expected = ParquetType::new_root("root".to_string(), fields); + let expected = ParquetType::new_root(PlSmallStr::from_static("root"), fields); assert_eq!(message, expected); Ok(()) @@ -932,7 +933,7 @@ mod tests { .unwrap(); let a2 = ParquetType::try_from_primitive( - "a2".to_string(), + "a2".into(), PhysicalType::ByteArray, Repetition::Repeated, Some(PrimitiveConvertedType::Utf8), @@ -940,38 +941,38 @@ mod tests { None, )?; let a1 = ParquetType::from_converted( - "a1".to_string(), + "a1".into(), vec![a2], Repetition::Optional, Some(GroupConvertedType::List), None, ); let b2 = ParquetType::from_converted( - 
"b2".to_string(), + "b2".into(), vec![ - ParquetType::from_physical("b3".to_string(), PhysicalType::Int32), - ParquetType::from_physical("b4".to_string(), PhysicalType::Double), + ParquetType::from_physical("b3".into(), PhysicalType::Int32), + ParquetType::from_physical("b4".into(), PhysicalType::Double), ], Repetition::Repeated, None, None, ); let b1 = ParquetType::from_converted( - "b1".to_string(), + "b1".into(), vec![b2], Repetition::Optional, Some(GroupConvertedType::List), None, ); let a0 = ParquetType::from_converted( - "a0".to_string(), + "a0".into(), vec![a1, b1], Repetition::Required, None, None, ); - let expected = ParquetType::new_root("root".to_string(), vec![a0]); + let expected = ParquetType::new_root("root".into(), vec![a0]); assert_eq!(message, expected); Ok(()) @@ -997,7 +998,7 @@ mod tests { .unwrap(); let f1 = ParquetType::try_from_primitive( - "_1".to_string(), + "_1".into(), PhysicalType::Int32, Repetition::Required, Some(PrimitiveConvertedType::Int8), @@ -1005,7 +1006,7 @@ mod tests { None, )?; let f2 = ParquetType::try_from_primitive( - "_2".to_string(), + "_2".into(), PhysicalType::Int32, Repetition::Required, Some(PrimitiveConvertedType::Int16), @@ -1013,7 +1014,7 @@ mod tests { None, )?; let f3 = ParquetType::try_from_primitive( - "_3".to_string(), + "_3".into(), PhysicalType::Float, Repetition::Required, None, @@ -1021,7 +1022,7 @@ mod tests { None, )?; let f4 = ParquetType::try_from_primitive( - "_4".to_string(), + "_4".into(), PhysicalType::Double, Repetition::Required, None, @@ -1029,7 +1030,7 @@ mod tests { None, )?; let f5 = ParquetType::try_from_primitive( - "_5".to_string(), + "_5".into(), PhysicalType::Int32, Repetition::Optional, None, @@ -1037,7 +1038,7 @@ mod tests { None, )?; let f6 = ParquetType::try_from_primitive( - "_6".to_string(), + "_6".into(), PhysicalType::ByteArray, Repetition::Optional, Some(PrimitiveConvertedType::Utf8), @@ -1047,7 +1048,7 @@ mod tests { let fields = vec![f1, f2, f3, f4, f5, f6]; - let expected = 
ParquetType::new_root("root".to_string(), fields); + let expected = ParquetType::new_root("root".into(), fields); assert_eq!(message, expected); Ok(()) } @@ -1075,7 +1076,7 @@ mod tests { .parse_message_type()?; let f1 = ParquetType::try_from_primitive( - "_1".to_string(), + "_1".into(), PhysicalType::Int32, Repetition::Required, None, @@ -1083,7 +1084,7 @@ mod tests { None, )?; let f2 = ParquetType::try_from_primitive( - "_2".to_string(), + "_2".into(), PhysicalType::Int32, Repetition::Required, None, @@ -1091,7 +1092,7 @@ mod tests { None, )?; let f3 = ParquetType::try_from_primitive( - "_3".to_string(), + "_3".into(), PhysicalType::Float, Repetition::Required, None, @@ -1099,7 +1100,7 @@ mod tests { None, )?; let f4 = ParquetType::try_from_primitive( - "_4".to_string(), + "_4".into(), PhysicalType::Double, Repetition::Required, None, @@ -1107,7 +1108,7 @@ mod tests { None, )?; let f5 = ParquetType::try_from_primitive( - "_5".to_string(), + "_5".into(), PhysicalType::Int32, Repetition::Optional, None, @@ -1115,7 +1116,7 @@ mod tests { None, )?; let f6 = ParquetType::try_from_primitive( - "_6".to_string(), + "_6".into(), PhysicalType::Int32, Repetition::Optional, None, @@ -1126,7 +1127,7 @@ mod tests { None, )?; let f7 = ParquetType::try_from_primitive( - "_7".to_string(), + "_7".into(), PhysicalType::Int64, Repetition::Optional, None, @@ -1137,7 +1138,7 @@ mod tests { None, )?; let f8 = ParquetType::try_from_primitive( - "_8".to_string(), + "_8".into(), PhysicalType::Int64, Repetition::Optional, None, @@ -1148,7 +1149,7 @@ mod tests { None, )?; let f9 = ParquetType::try_from_primitive( - "_9".to_string(), + "_9".into(), PhysicalType::Int64, Repetition::Optional, None, @@ -1160,7 +1161,7 @@ mod tests { )?; let f10 = ParquetType::try_from_primitive( - "_10".to_string(), + "_10".into(), PhysicalType::ByteArray, Repetition::Optional, None, @@ -1170,7 +1171,7 @@ mod tests { let fields = vec![f1, f2, f3, f4, f5, f6, f7, f8, f9, f10]; - let expected = 
ParquetType::new_root("root".to_string(), fields); + let expected = ParquetType::new_root("root".into(), fields); assert_eq!(message, expected); Ok(()) } diff --git a/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs b/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs index 90bc12f7fb5c..b0bbe20999bc 100644 --- a/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs +++ b/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs @@ -1,4 +1,5 @@ use parquet_format_safe::SchemaElement; +use polars_utils::pl_str::PlSmallStr; use super::super::types::ParquetType; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -40,7 +41,7 @@ fn from_thrift_helper( let element = elements.get(index).ok_or_else(|| { ParquetError::oos(format!("index {} on SchemaElement is not valid", index)) })?; - let name = element.name.clone(); + let name = PlSmallStr::from_str(element.name.as_str()); let converted_type = element.converted_type; let id = element.field_id; diff --git a/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs b/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs index 27c9d886b2ef..3aef1fe792fa 100644 --- a/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs +++ b/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs @@ -32,7 +32,7 @@ fn to_thrift_helper(schema: &ParquetType, elements: &mut Vec, is_ type_: Some(type_), type_length, repetition_type: Some(field_info.repetition.into()), - name: field_info.name.clone(), + name: field_info.name.to_string(), num_children: None, converted_type, precision: maybe_decimal.map(|x| x.0), @@ -62,7 +62,7 @@ fn to_thrift_helper(schema: &ParquetType, elements: &mut Vec, is_ type_: None, type_length: None, repetition_type: repetition_type.map(|x| x.into()), - name: field_info.name.clone(), + name: field_info.name.to_string(), num_children: Some(fields.len() as i32), converted_type, scale: None, diff --git 
a/crates/polars-parquet/src/parquet/schema/types/basic_type.rs b/crates/polars-parquet/src/parquet/schema/types/basic_type.rs index b3697fcaa1c3..e882f83516f5 100644 --- a/crates/polars-parquet/src/parquet/schema/types/basic_type.rs +++ b/crates/polars-parquet/src/parquet/schema/types/basic_type.rs @@ -1,3 +1,4 @@ +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde_types")] use serde::{Deserialize, Serialize}; @@ -8,7 +9,7 @@ use super::super::Repetition; #[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] pub struct FieldInfo { /// The field name - pub name: String, + pub name: PlSmallStr, /// The repetition pub repetition: Repetition, /// the optional id, to select fields by id diff --git a/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs b/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs index c5c5642eb1c6..ad703cc884a3 100644 --- a/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs +++ b/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs @@ -1,5 +1,6 @@ // see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md use polars_utils::aliases::*; +use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde_types")] use serde::{Deserialize, Serialize}; @@ -26,7 +27,7 @@ pub struct PrimitiveType { impl PrimitiveType { /// Helper method to create an optional field with no logical or converted types. 
- pub fn from_physical(name: String, physical_type: PhysicalType) -> Self { + pub fn from_physical(name: PlSmallStr, physical_type: PhysicalType) -> Self { let field_info = FieldInfo { name, repetition: Repetition::Optional, @@ -114,7 +115,7 @@ impl ParquetType { /// Constructors impl ParquetType { - pub(crate) fn new_root(name: String, fields: Vec) -> Self { + pub(crate) fn new_root(name: PlSmallStr, fields: Vec) -> Self { let field_info = FieldInfo { name, repetition: Repetition::Optional, @@ -129,7 +130,7 @@ impl ParquetType { } pub fn from_converted( - name: String, + name: PlSmallStr, fields: Vec, repetition: Repetition, converted_type: Option, @@ -152,7 +153,7 @@ impl ParquetType { /// # Error /// Errors iff the combination of physical, logical and converted type is not valid. pub fn try_from_primitive( - name: String, + name: PlSmallStr, physical_type: PhysicalType, repetition: Repetition, converted_type: Option, @@ -178,12 +179,12 @@ impl ParquetType { /// Helper method to create a [`ParquetType::PrimitiveType`] optional field /// with no logical or converted types. 
- pub fn from_physical(name: String, physical_type: PhysicalType) -> Self { + pub fn from_physical(name: PlSmallStr, physical_type: PhysicalType) -> Self { ParquetType::PrimitiveType(PrimitiveType::from_physical(name, physical_type)) } pub fn from_group( - name: String, + name: PlSmallStr, repetition: Repetition, converted_type: Option, logical_type: Option, diff --git a/crates/polars-parquet/src/parquet/write/column_chunk.rs b/crates/polars-parquet/src/parquet/write/column_chunk.rs index 3a5a9a504d9c..6ae51a191dc5 100644 --- a/crates/polars-parquet/src/parquet/write/column_chunk.rs +++ b/crates/polars-parquet/src/parquet/write/column_chunk.rs @@ -179,7 +179,11 @@ fn build_column_chunk( let metadata = ColumnMetaData { type_, encodings, - path_in_schema: descriptor.path_in_schema.clone(), + path_in_schema: descriptor + .path_in_schema + .iter() + .map(|x| x.to_string()) + .collect::>(), codec: compression.into(), num_values, total_uncompressed_size, diff --git a/crates/polars-parquet/src/parquet/write/statistics.rs b/crates/polars-parquet/src/parquet/write/statistics.rs index d37256d3ca1e..c006ca44f04a 100644 --- a/crates/polars-parquet/src/parquet/write/statistics.rs +++ b/crates/polars-parquet/src/parquet/write/statistics.rs @@ -164,20 +164,14 @@ mod tests { fn binary() -> ParquetResult<()> { let iter = vec![ BinaryStatistics { - primitive_type: PrimitiveType::from_physical( - "bla".to_string(), - PhysicalType::ByteArray, - ), + primitive_type: PrimitiveType::from_physical("bla".into(), PhysicalType::ByteArray), null_count: Some(0), distinct_count: None, min_value: Some(vec![1, 2]), max_value: Some(vec![3, 4]), }, BinaryStatistics { - primitive_type: PrimitiveType::from_physical( - "bla".to_string(), - PhysicalType::ByteArray, - ), + primitive_type: PrimitiveType::from_physical("bla".into(), PhysicalType::ByteArray), null_count: Some(0), distinct_count: None, min_value: Some(vec![4, 5]), @@ -189,10 +183,7 @@ mod tests { assert_eq!( a, BinaryStatistics { - 
primitive_type: PrimitiveType::from_physical( - "bla".to_string(), - PhysicalType::ByteArray, - ), + primitive_type: PrimitiveType::from_physical("bla".into(), PhysicalType::ByteArray,), null_count: Some(0), distinct_count: None, min_value: Some(vec![1, 2]), @@ -208,7 +199,7 @@ mod tests { let iter = vec![ FixedLenStatistics { primitive_type: PrimitiveType::from_physical( - "bla".to_string(), + "bla".into(), PhysicalType::FixedLenByteArray(2), ), null_count: Some(0), @@ -218,7 +209,7 @@ mod tests { }, FixedLenStatistics { primitive_type: PrimitiveType::from_physical( - "bla".to_string(), + "bla".into(), PhysicalType::FixedLenByteArray(2), ), null_count: Some(0), @@ -233,7 +224,7 @@ mod tests { a, FixedLenStatistics { primitive_type: PrimitiveType::from_physical( - "bla".to_string(), + "bla".into(), PhysicalType::FixedLenByteArray(2), ), null_count: Some(0), @@ -284,7 +275,7 @@ mod tests { distinct_count: None, min_value: Some(30), max_value: Some(70), - primitive_type: PrimitiveType::from_physical("bla".to_string(), PhysicalType::Int32), + primitive_type: PrimitiveType::from_physical("bla".into(), PhysicalType::Int32), }]; let a = reduce_primitive(iter.iter()); @@ -295,10 +286,7 @@ mod tests { distinct_count: None, min_value: Some(30), max_value: Some(70), - primitive_type: PrimitiveType::from_physical( - "bla".to_string(), - PhysicalType::Int32, - ), + primitive_type: PrimitiveType::from_physical("bla".into(), PhysicalType::Int32,), }, ); diff --git a/crates/polars-pipe/Cargo.toml b/crates/polars-pipe/Cargo.toml index b11a43bbaae8..cec5c8484285 100644 --- a/crates/polars-pipe/Cargo.toml +++ b/crates/polars-pipe/Cargo.toml @@ -28,7 +28,6 @@ enum_dispatch = { version = "0.3" } hashbrown = { workspace = true } num-traits = { workspace = true } rayon = { workspace = true } -smartstring = { workspace = true } [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-pipe/src/executors/operators/projection.rs 
b/crates/polars-pipe/src/executors/operators/projection.rs index f609a592f8da..67141d0c44a7 100644 --- a/crates/polars-pipe/src/executors/operators/projection.rs +++ b/crates/polars-pipe/src/executors/operators/projection.rs @@ -4,19 +4,19 @@ use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; use polars_core::schema::SchemaRef; use polars_plan::prelude::ProjectionOptions; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use crate::expressions::PhysicalPipedExpr; use crate::operators::{DataChunk, Operator, OperatorResult, PExecutionContext}; #[derive(Clone)] pub(crate) struct SimpleProjectionOperator { - columns: Arc<[SmartString]>, + columns: Arc<[PlSmallStr]>, input_schema: SchemaRef, } impl SimpleProjectionOperator { - pub(crate) fn new(columns: Arc<[SmartString]>, input_schema: SchemaRef) -> Self { + pub(crate) fn new(columns: Arc<[PlSmallStr]>, input_schema: SchemaRef) -> Self { Self { columns, input_schema, @@ -30,11 +30,12 @@ impl Operator for SimpleProjectionOperator { _context: &PExecutionContext, chunk: &DataChunk, ) -> PolarsResult { - let chunk = chunk.with_data( - chunk - .data - .select_with_schema_unchecked(self.columns.as_ref(), &self.input_schema)?, - ); + let check_duplicates = false; + let chunk = chunk.with_data(chunk.data._select_with_schema_impl( + self.columns.as_ref(), + &self.input_schema, + check_duplicates, + )?); Ok(OperatorResult::Finished(chunk)) } fn split(&self, _thread_no: usize) -> Box { diff --git a/crates/polars-pipe/src/executors/operators/reproject.rs b/crates/polars-pipe/src/executors/operators/reproject.rs index ca2bd5cb1e78..0c176b134af8 100644 --- a/crates/polars-pipe/src/executors/operators/reproject.rs +++ b/crates/polars-pipe/src/executors/operators/reproject.rs @@ -1,5 +1,6 @@ use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; +use polars_core::prelude::IndexOfSchema; use polars_core::schema::Schema; use crate::operators::DataChunk; @@ 
-14,9 +15,12 @@ pub(crate) fn reproject_chunk( // the positions for subsequent calls let chunk_schema = chunk.data.schema(); - let out = chunk - .data - .select_with_schema_unchecked(schema.iter_names(), &chunk_schema)?; + let check_duplicates = false; + let out = chunk.data._select_with_schema_impl( + schema.get_names_owned().as_slice(), + &chunk_schema, + check_duplicates, + )?; *positions = out .get_columns() diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs index 1603de3729fa..ae98e973f5ba 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs @@ -12,6 +12,7 @@ use polars_plan::plans::expr_ir::ExprIR; use polars_plan::plans::{ArenaExprIter, Context}; use polars_plan::prelude::{AExpr, IRAggExpr}; use polars_utils::arena::{Arena, Node}; +use polars_utils::pl_str::PlSmallStr; use polars_utils::IdxSize; use crate::executors::sinks::group_by::aggregates::count::CountAgg; @@ -31,7 +32,7 @@ impl PhysicalIoExpr for Len { unimplemented!() } - fn live_variables(&self) -> Option>> { + fn live_variables(&self) -> Option> { Some(vec![]) } } @@ -39,7 +40,10 @@ impl PhysicalPipedExpr for Len { fn evaluate(&self, chunk: &DataChunk, _lazy_state: &ExecutionState) -> PolarsResult { // the length must match the chunks as the operators expect that // so we fill a null series. 
- Ok(Series::new_null("", chunk.data.height())) + Ok(Series::new_null( + PlSmallStr::const_default(), + chunk.data.height(), + )) } fn field(&self, _input_schema: &Schema) -> PolarsResult { diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 2bb4f57b46a1..1fd2e2328384 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -271,7 +271,7 @@ impl AggHashTable { cols.extend( key_columns .into_iter() - .map(|arr| Series::try_from(("", arr)).unwrap()), + .map(|arr| Series::try_from((PlSmallStr::const_default(), arr)).unwrap()), ); cols.extend(agg_builders.into_iter().map(|buf| buf.into_series())); physical_agg_to_logical(&mut cols, &self.output_schema); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs index 41967ee85426..55244679e204 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs @@ -65,7 +65,7 @@ impl SpillPayload { schema.with_column(INDEX_COL.into(), IDX_DTYPE); schema.with_column(KEYS_COL.into(), DataType::BinaryOffset); for s in &self.aggs { - schema.with_column(s.name().into(), s.dtype().clone()); + schema.with_column(s.name().clone(), s.dtype().clone()); } schema } @@ -74,9 +74,12 @@ impl SpillPayload { debug_assert_eq!(self.hashes.len(), self.chunk_idx.len()); debug_assert_eq!(self.hashes.len(), self.keys.len()); - let hashes = UInt64Chunked::from_vec(HASH_COL, self.hashes).into_series(); - let chunk_idx = IdxCa::from_vec(INDEX_COL, self.chunk_idx).into_series(); - let keys = BinaryOffsetChunked::with_chunk(KEYS_COL, self.keys).into_series(); + let hashes = + UInt64Chunked::from_vec(PlSmallStr::from_static(HASH_COL), self.hashes).into_series(); + let chunk_idx = + 
IdxCa::from_vec(PlSmallStr::from_static(INDEX_COL), self.chunk_idx).into_series(); + let keys = BinaryOffsetChunked::with_chunk(PlSmallStr::from_static(KEYS_COL), self.keys) + .into_series(); let mut cols = Vec::with_capacity(self.aggs.len() + 3); cols.push(hashes); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs index 3554c24c7e65..e9edd3b22f25 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/thread_local.rs @@ -139,7 +139,7 @@ impl SpillPartitions { .zip(self.output_schema.iter_names()) .map(|(b, name)| { let mut s = b.reset(OB_SIZE); - s.rename(name); + s.rename(name.clone()); s }) .collect(), diff --git a/crates/polars-pipe/src/executors/sinks/group_by/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs index c2eaafe39d76..7a999e7e7cc7 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs @@ -16,7 +16,7 @@ pub(crate) use string::*; pub(super) fn physical_agg_to_logical(cols: &mut [Series], output_schema: &Schema) { for (s, (name, dtype)) in cols.iter_mut().zip(output_schema.iter()) { if s.name() != name { - s.rename(name); + s.rename(name.clone()); } match dtype { #[cfg(feature = "dtype-categorical")] diff --git a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs index 8294e2512e1e..d20ab9bf2b0d 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs @@ -173,7 +173,7 @@ where let agg_fns = unsafe { std::slice::from_raw_parts_mut(ptr, aggregators_len) }; let mut key_builder = PrimitiveChunkedBuilder::::new( - self.output_schema.get_at_index(0).unwrap().0, + 
self.output_schema.get_at_index(0).unwrap().0.clone(), agg_map.len(), ); let dtypes = agg_fns diff --git a/crates/polars-pipe/src/executors/sinks/group_by/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs index 84b2656e11cc..40478e86364e 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -61,7 +61,7 @@ pub struct StringGroupbySink { // by: // * offset = (idx) // * end = (offset + 1) - keys: Vec>, + keys: Vec>, aggregators: Vec, // the key that will be aggregated on key_column: Arc, @@ -186,7 +186,8 @@ impl StringGroupbySink { .collect::>(); let cap = std::cmp::min(slice_len, agg_map.len()); - let mut key_builder = StringChunkedBuilder::new("", cap); + let mut key_builder = + StringChunkedBuilder::new(PlSmallStr::const_default(), cap); agg_map.into_iter().skip(offset).take(slice_len).for_each( |(k, &offset)| { let key_offset = k.idx as usize; @@ -582,7 +583,7 @@ fn get_entry<'a>( key_val: Option<&str>, h: u64, current_partition: &'a mut PlIdHashMap, - keys: &[Option], + keys: &[Option], ) -> RawEntryMut<'a, Key, IdxSize, IdBuildHasher> { current_partition.raw_entry_mut().from_hash(h, |key| { // first compare the hash before we incur the cache miss diff --git a/crates/polars-pipe/src/executors/sinks/io.rs b/crates/polars-pipe/src/executors/sinks/io.rs index 15cc6f8f2537..d8561c1a4c73 100644 --- a/crates/polars-pipe/src/executors/sinks/io.rs +++ b/crates/polars-pipe/src/executors/sinks/io.rs @@ -170,6 +170,7 @@ impl IOThread { if let Some(partitions) = partitions { for (part, mut df) in partitions.into_no_null_iter().zip(iter) { df.shrink_to_fit(); + df.align_chunks(); let mut path = dir2.clone(); path.push(format!("{part}")); @@ -193,6 +194,7 @@ impl IOThread { for mut df in iter { df.shrink_to_fit(); + df.align_chunks(); writer.write_batch(&df).unwrap(); } writer.finish().unwrap(); @@ -240,7 +242,10 @@ impl IOThread { } pub(in crate::executors::sinks) fn 
dump_partition(&self, partition_no: IdxSize, df: DataFrame) { - let partition = Some(IdxCa::from_vec("", vec![partition_no])); + let partition = Some(IdxCa::from_vec( + PlSmallStr::const_default(), + vec![partition_no], + )); let iter = Box::new(std::iter::once(df)); self.dump_iter(partition, iter) } diff --git a/crates/polars-pipe/src/executors/sinks/joins/cross.rs b/crates/polars-pipe/src/executors/sinks/joins/cross.rs index bac7c8243139..d6014c344978 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/cross.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/cross.rs @@ -8,7 +8,7 @@ use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; use polars_ops::prelude::CrossJoin as CrossJoinTrait; use polars_utils::arena::Node; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use crate::executors::operators::PlaceHolder; use crate::operators::{ @@ -19,7 +19,7 @@ use crate::operators::{ #[derive(Default)] pub struct CrossJoin { chunks: Vec, - suffix: SmartString, + suffix: PlSmallStr, swapped: bool, node: Node, placeholder: PlaceHolder, @@ -27,7 +27,7 @@ pub struct CrossJoin { impl CrossJoin { pub(crate) fn new( - suffix: SmartString, + suffix: PlSmallStr, swapped: bool, node: Node, placeholder: PlaceHolder, @@ -73,7 +73,7 @@ impl Sink for CrossJoin { fn finalize(&mut self, _context: &PExecutionContext) -> PolarsResult { let op = Box::new(CrossJoinProbe { df: Arc::new(chunks_to_df_unchecked(std::mem::take(&mut self.chunks))), - suffix: Arc::from(self.suffix.as_ref()), + suffix: self.suffix.clone(), in_process_left: None, in_process_right: None, in_process_left_df: Default::default(), @@ -97,11 +97,11 @@ impl Sink for CrossJoin { #[derive(Clone)] pub struct CrossJoinProbe { df: Arc, - suffix: Arc, + suffix: PlSmallStr, in_process_left: Option>>, in_process_right: Option>>, in_process_left_df: DataFrame, - output_names: Option>, + output_names: Option>, swapped: bool, } @@ -159,7 +159,7 @@ impl Operator for 
CrossJoinProbe { (&self.in_process_left_df, &right_df) }; - let mut df = a.cross_join(b, Some(self.suffix.as_ref()), None)?; + let mut df = a.cross_join(b, Some(self.suffix.clone()), None)?; // Cross joins can produce multiple chunks. // No parallelize in operators df.as_single_chunk(); @@ -183,7 +183,7 @@ impl Operator for CrossJoinProbe { // this we can amortize the name allocations. let mut df = match &self.output_names { None => { - let df = a.cross_join(b, Some(self.suffix.as_ref()), None)?; + let df = a.cross_join(b, Some(self.suffix.clone()), None)?; self.output_names = Some(df.get_column_names_owned()); df }, diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index 3e1b4920be2d..9703988e1eb3 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -6,9 +6,9 @@ use polars_core::prelude::*; use polars_core::utils::{_set_partition_size, accumulate_dataframes_vertical_unchecked}; use polars_ops::prelude::JoinArgs; use polars_utils::arena::Node; +use polars_utils::pl_str::PlSmallStr; use polars_utils::slice::GetSaferUnchecked; use polars_utils::unitvec; -use smartstring::alias::String as SmartString; use super::*; use crate::executors::operators::PlaceHolder; @@ -32,7 +32,7 @@ pub struct GenericBuild { // * chunk_offset = (idx * n_join_keys) // * end = (offset + n_join_keys) materialized_join_cols: Vec>, - suffix: Arc, + suffix: PlSmallStr, hb: PlRandomState, join_args: JoinArgs, // partitioned tables that will be used for probing @@ -50,23 +50,23 @@ pub struct GenericBuild { swapped: bool, join_nulls: bool, node: Node, - key_names_left: Arc<[SmartString]>, - key_names_right: Arc<[SmartString]>, + key_names_left: Arc<[PlSmallStr]>, + key_names_right: Arc<[PlSmallStr]>, placeholder: PlaceHolder, } impl GenericBuild { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - suffix: Arc, + 
suffix: PlSmallStr, join_args: JoinArgs, swapped: bool, join_columns_left: Arc>>, join_columns_right: Arc>>, join_nulls: bool, node: Node, - key_names_left: Arc<[SmartString]>, - key_names_right: Arc<[SmartString]>, + key_names_left: Arc<[PlSmallStr]>, + key_names_right: Arc<[PlSmallStr]>, placeholder: PlaceHolder, ) -> Self { let hb: PlRandomState = Default::default(); diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_inner_left.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_inner_left.rs index f2c49e3fe5e5..5337d517cb79 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_inner_left.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_inner_left.rs @@ -7,7 +7,7 @@ use polars_ops::chunked_array::DfTake; use polars_ops::frame::join::_finish_join; use polars_ops::prelude::{JoinArgs, JoinType}; use polars_utils::nulls::IsNull; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use crate::executors::sinks::joins::generic_build::*; use crate::executors::sinks::joins::row_values::RowValues; @@ -29,7 +29,7 @@ pub struct GenericJoinProbe { /// * chunk_offset = (idx * n_join_keys) /// * end = (offset + n_join_keys) materialized_join_cols: Arc<[BinaryArray]>, - suffix: Arc, + suffix: PlSmallStr, hb: PlRandomState, /// partitioned tables that will be used for probing /// stores the key and the chunk_idx, df_idx of the left table @@ -46,7 +46,7 @@ pub struct GenericJoinProbe { /// the join order is swapped to ensure we hash the smaller table swapped_or_left: bool, /// cached output names - output_names: Option>, + output_names: Option>, args: JoinArgs, join_nulls: bool, row_values: RowValues, @@ -57,7 +57,7 @@ impl GenericJoinProbe { pub(super) fn new( mut df_a: DataFrame, materialized_join_cols: Arc<[BinaryArray]>, - suffix: Arc, + suffix: PlSmallStr, hb: PlRandomState, hash_tables: Arc>, join_columns_left: Arc>>, @@ -83,10 +83,10 @@ impl GenericJoinProbe { phys_e 
.evaluate(&tmp, &context.execution_state) .ok() - .map(|s| s.name().to_string()) + .map(|s| s.name().clone()) }) - .collect::>(); - df_a = df_a.drop_many(&names) + .collect::>(); + df_a = df_a.drop_many_amortized(&names) } GenericJoinProbe { @@ -113,7 +113,7 @@ impl GenericJoinProbe { ) -> PolarsResult { Ok(match &self.output_names { None => { - let out = _finish_join(left_df, right_df, Some(self.suffix.as_ref()))?; + let out = _finish_join(left_df, right_df, Some(self.suffix.clone()))?; self.output_names = Some(out.get_column_names_owned()); out }, @@ -129,7 +129,7 @@ impl GenericJoinProbe { .iter_mut() .zip(names) .for_each(|(s, name)| { - s.rename(name); + s.rename(name.clone()); }); left_df }, diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs index 287e00d6b6a2..0157fe660de5 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs @@ -6,7 +6,7 @@ use polars_core::series::IsSorted; use polars_ops::chunked_array::DfTake; use polars_ops::frame::join::_finish_join; use polars_ops::prelude::_coalesce_full_join; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use crate::executors::sinks::joins::generic_build::*; use crate::executors::sinks::joins::row_values::RowValues; @@ -31,7 +31,7 @@ pub struct GenericFullOuterJoinProbe { /// * chunk_offset = (idx * n_join_keys) /// * end = (offset + n_join_keys) materialized_join_cols: Arc<[BinaryArray]>, - suffix: Arc, + suffix: PlSmallStr, hb: PlRandomState, /// partitioned tables that will be used for probing. /// stores the key and the chunk_idx, df_idx of the left table. 
@@ -48,13 +48,13 @@ pub struct GenericFullOuterJoinProbe { // the join order is swapped to ensure we hash the smaller table swapped: bool, // cached output names - output_names: Option>, + output_names: Option>, join_nulls: bool, coalesce: bool, thread_no: usize, row_values: RowValues, - key_names_left: Arc<[SmartString]>, - key_names_right: Arc<[SmartString]>, + key_names_left: Arc<[PlSmallStr]>, + key_names_right: Arc<[PlSmallStr]>, } impl GenericFullOuterJoinProbe { @@ -62,7 +62,7 @@ impl GenericFullOuterJoinProbe { pub(super) fn new( df_a: DataFrame, materialized_join_cols: Arc<[BinaryArray]>, - suffix: Arc, + suffix: PlSmallStr, hb: PlRandomState, hash_tables: Arc>, join_columns_right: Arc>>, @@ -71,8 +71,8 @@ impl GenericFullOuterJoinProbe { amortized_hashes: Vec, join_nulls: bool, coalesce: bool, - key_names_left: Arc<[SmartString]>, - key_names_right: Arc<[SmartString]>, + key_names_left: Arc<[PlSmallStr]>, + key_names_right: Arc<[PlSmallStr]>, ) -> Self { GenericFullOuterJoinProbe { df_a: Arc::new(df_a), @@ -99,9 +99,9 @@ impl GenericFullOuterJoinProbe { fn inner( left_df: DataFrame, right_df: DataFrame, - suffix: &str, + suffix: PlSmallStr, swapped: bool, - output_names: &mut Option>, + output_names: &mut Option>, ) -> PolarsResult { let (mut left_df, right_df) = if swapped { (right_df, left_df) @@ -126,7 +126,7 @@ impl GenericFullOuterJoinProbe { .iter_mut() .zip(names) .for_each(|(s, name)| { - s.rename(name); + s.rename(name.clone()); }); left_df }, @@ -137,32 +137,24 @@ impl GenericFullOuterJoinProbe { let out = inner( left_df.clone(), right_df, - self.suffix.as_ref(), + self.suffix.clone(), self.swapped, &mut self.output_names, )?; - let l = self - .key_names_left - .iter() - .map(|s| s.as_str()) - .collect::>(); - let r = self - .key_names_right - .iter() - .map(|s| s.as_str()) - .collect::>(); + let l = self.key_names_left.iter().cloned().collect::>(); + let r = self.key_names_right.iter().cloned().collect::>(); Ok(_coalesce_full_join( out, - &l, - 
&r, - Some(self.suffix.as_ref()), + l.as_slice(), + r.as_slice(), + Some(self.suffix.clone()), &left_df, )) } else { inner( left_df.clone(), right_df, - self.suffix.as_ref(), + self.suffix.clone(), self.swapped, &mut self.output_names, ) @@ -276,7 +268,7 @@ impl GenericFullOuterJoinProbe { right_df .get_columns() .iter() - .map(|s| Series::full_null(s.name(), size, s.dtype())) + .map(|s| Series::full_null(s.name().clone(), size, s.dtype())) .collect(), ) }; diff --git a/crates/polars-pipe/src/executors/sinks/reproject.rs b/crates/polars-pipe/src/executors/sinks/reproject.rs index 8d66e102fd92..ecba66f188e4 100644 --- a/crates/polars-pipe/src/executors/sinks/reproject.rs +++ b/crates/polars-pipe/src/executors/sinks/reproject.rs @@ -1,5 +1,6 @@ use std::any::Any; +use polars_core::prelude::IndexOfSchema; use polars_core::schema::SchemaRef; use crate::executors::sources::ReProjectSource; @@ -40,7 +41,7 @@ impl Sink for ReProjectSink { fn finalize(&mut self, context: &PExecutionContext) -> PolarsResult { Ok(match self.sink.finalize(context)? { FinalizedSink::Finished(df) => { - FinalizedSink::Finished(df.select(self.schema.iter_names())?) + FinalizedSink::Finished(df._select_impl(self.schema.get_names_owned().as_slice())?) 
}, FinalizedSink::Source(source) => { FinalizedSink::Source(Box::new(ReProjectSource::new(self.schema.clone(), source))) diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink.rs b/crates/polars-pipe/src/executors/sinks/sort/sink.rs index 5bd51deba54a..d24c0e40ed25 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink.rs @@ -8,6 +8,7 @@ use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; use polars_core::prelude::{AnyValue, SchemaRef, Series, SortOptions}; use polars_core::utils::accumulate_dataframes_vertical_unchecked; +use polars_utils::pl_str::PlSmallStr; use crate::executors::sinks::io::{block_thread_until_io_thread_done, IOThread}; use crate::executors::sinks::memory::MemTracker; @@ -190,7 +191,9 @@ impl Sink for SortSink { let mut lock = self.io_thread.write().unwrap(); let io_thread = lock.take().unwrap(); - let dist = Series::from_any_values("", &self.dist_sample, true).unwrap(); + let dist = + Series::from_any_values(PlSmallStr::const_default(), &self.dist_sample, true) + .unwrap(); let dist = dist.sort_with(SortOptions::from(&self.sort_options))?; let instant = self.ooc_start.unwrap(); diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index c7256f084aeb..6b796a6f3bc5 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -99,7 +99,8 @@ fn finalize_dataframe( for (sort_idx, arr) in sort_idx.into_iter().zip(arrays) { let (name, logical_dtype) = schema.get_at_index(sort_idx).unwrap(); assert_eq!(logical_dtype.to_physical(), DataType::from(arr.data_type())); - let col = Series::from_chunks_and_dtype_unchecked(name, vec![arr], logical_dtype); + let col = + Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr], logical_dtype); cols.insert(sort_idx, col); } } @@ -227,7 +228,7 @@ impl 
SortSinkMultiple { let rows_encoded = polars_row::convert_columns(&self.sort_column, &self.sort_fields); let column = unsafe { Series::from_chunks_and_dtype_unchecked( - POLARS_SORT_COLUMN, + PlSmallStr::from_static(POLARS_SORT_COLUMN), vec![Box::new(rows_encoded.into_array())], &DataType::BinaryOffset, ) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 5155d7bdfcff..2c34228bada6 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -125,7 +125,8 @@ impl CsvSource { }; if let Some(col) = &file_options.include_file_paths { - self.include_file_path = Some(StringChunked::full(col, path.to_str().unwrap(), 1)); + self.include_file_path = + Some(StringChunked::full(col.clone(), path.to_str().unwrap(), 1)); }; self.reader = Some(reader); diff --git a/crates/polars-pipe/src/operators/chunks.rs b/crates/polars-pipe/src/operators/chunks.rs index 10b89784eaa3..1c78a32dde80 100644 --- a/crates/polars-pipe/src/operators/chunks.rs +++ b/crates/polars-pipe/src/operators/chunks.rs @@ -138,7 +138,7 @@ mod test { .iter() .enumerate() .map(|(i, length)| { - let series = Series::new("val", vec![i as u64; *length]); + let series = Series::new("val".into(), vec![i as u64; *length]); DataFrame::new(vec![series]).unwrap() }) .collect(); diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 1f080941b38b..1e6f93eac9df 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -131,7 +131,7 @@ where self.p.evaluate_io(df) } - fn live_variables(&self) -> Option>> { + fn live_variables(&self) -> Option> { None } @@ -259,7 +259,7 @@ where match &options.args.how { #[cfg(feature = "cross_join")] JoinType::Cross => Box::new(CrossJoin::new( - options.args.suffix().into(), + options.args.suffix().clone(), swapped, node, placeholder, @@ -293,7 +293,7 @@ where let 
(join_columns_left, join_columns_right) = swap_eval(); Box::new(GenericBuild::<()>::new( - Arc::from(options.args.suffix()), + options.args.suffix().clone(), options.args.clone(), swapped, join_columns_left, @@ -320,7 +320,7 @@ where let (join_columns_left, join_columns_right) = swap_eval(); Box::new(GenericBuild::::new( - Arc::from(options.args.suffix()), + options.args.suffix().clone(), options.args.clone(), swapped, join_columns_left, @@ -390,7 +390,7 @@ where let keys = input_schema .iter_names() .map(|name| { - let name: Arc = Arc::from(name.as_str()); + let name: PlSmallStr = name.clone(); let node = expr_arena.add(AExpr::Column(name.clone())); ExprIR::new(node, OutputName::Alias(name)) }) @@ -421,7 +421,7 @@ where input_schema.get_full(name.as_str()).unwrap(); group_by_out_schema.with_column(name.clone(), dtype.clone()); - let name: Arc = Arc::from(name.as_str()); + let name: PlSmallStr = name.clone(); let col = expr_arena.add(AExpr::Column(name.clone())); let node = match options.keep_strategy { UniqueKeepStrategy::First | UniqueKeepStrategy::Any => { diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 5d1cdc79ab15..b37b9b445f10 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -41,7 +41,6 @@ recursive = { workspace = true } regex = { workspace = true, optional = true } serde = { workspace = true, features = ["rc"], optional = true } serde_json = { workspace = true, optional = true } -smartstring = { workspace = true } strum_macros = { workspace = true } [build-dependencies] @@ -57,6 +56,7 @@ serde = [ "polars-time/serde", "polars-io/serde", "polars-ops/serde", + "polars-utils/serde", "either/serde", ] streaming = [] diff --git a/crates/polars-plan/src/constants.rs b/crates/polars-plan/src/constants.rs index 2ae0c0e47c47..e63ad1193774 100644 --- a/crates/polars-plan/src/constants.rs +++ b/crates/polars-plan/src/constants.rs @@ -1,22 +1,22 @@ -use std::sync::{Arc, OnceLock}; +use 
std::sync::OnceLock; -use crate::prelude::ColumnName; +use polars_utils::pl_str::PlSmallStr; pub static MAP_LIST_NAME: &str = "map_list"; pub static CSE_REPLACED: &str = "__POLARS_CSER_"; pub const LEN: &str = "len"; -pub const LITERAL_NAME: &str = "literal"; +const LITERAL_NAME: &str = "literal"; pub const UNLIMITED_CACHE: u32 = u32::MAX; // Cache the often used LITERAL and LEN constants -static LITERAL_NAME_INIT: OnceLock> = OnceLock::new(); -static LEN_INIT: OnceLock> = OnceLock::new(); +static LITERAL_NAME_INIT: OnceLock = OnceLock::new(); +static LEN_INIT: OnceLock = OnceLock::new(); -pub(crate) fn get_literal_name() -> Arc { - LITERAL_NAME_INIT - .get_or_init(|| ColumnName::from(LITERAL_NAME)) - .clone() +pub fn get_literal_name() -> &'static PlSmallStr { + LITERAL_NAME_INIT.get_or_init(|| PlSmallStr::from_static(LITERAL_NAME)) } -pub(crate) fn get_len_name() -> Arc { - LEN_INIT.get_or_init(|| ColumnName::from(LEN)).clone() +pub(crate) fn get_len_name() -> PlSmallStr { + LEN_INIT + .get_or_init(|| PlSmallStr::from_static(LEN)) + .clone() } diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index c8ab3ae66d56..558a7a98a42a 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -174,7 +174,7 @@ impl ArrayNameSpace { let fields = (0..*width) .map(|i| { let name = arr_default_struct_name_gen(i); - Field::from_owned(name, inner.as_ref().clone()) + Field::new(name, inner.as_ref().clone()) }) .collect(); Ok(DataType::Struct(fields)) diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs index da6accf80c6f..4553f87360e0 100644 --- a/crates/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -70,9 +70,9 @@ impl AsRef for AggExpr { #[must_use] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Expr { - Alias(Arc, ColumnName), - Column(ColumnName), - Columns(Arc<[ColumnName]>), + Alias(Arc, PlSmallStr), + Column(PlSmallStr), + 
Columns(Arc<[PlSmallStr]>), DtypeColumn(Vec), IndexColumn(Arc<[i64]>), Literal(LiteralValue), @@ -136,6 +136,7 @@ pub enum Expr { length: Arc, }, /// Can be used in a select statement to exclude a column from selection + /// TODO: See if we can replace `Vec` with `Arc` Exclude(Arc, Vec), /// Set root name as Alias KeepName(Arc), @@ -149,7 +150,7 @@ pub enum Expr { expr: Arc, }, #[cfg(feature = "dtype-struct")] - Field(Arc<[ColumnName]>), + Field(Arc<[PlSmallStr]>), AnonymousFunction { /// function arguments input: Vec, @@ -303,7 +304,7 @@ impl Default for Expr { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Excluded { - Name(ColumnName), + Name(PlSmallStr), Dtype(DataType), } diff --git a/crates/polars-plan/src/dsl/expr_dyn_fn.rs b/crates/polars-plan/src/dsl/expr_dyn_fn.rs index d2593d0e3bcb..911a0c4308b2 100644 --- a/crates/polars-plan/src/dsl/expr_dyn_fn.rs +++ b/crates/polars-plan/src/dsl/expr_dyn_fn.rs @@ -124,11 +124,11 @@ impl Default for SpecialEq> { } pub trait RenameAliasFn: Send + Sync { - fn call(&self, name: &str) -> PolarsResult; + fn call(&self, name: &PlSmallStr) -> PolarsResult; } -impl PolarsResult + Send + Sync> RenameAliasFn for F { - fn call(&self, name: &str) -> PolarsResult { +impl PolarsResult + Send + Sync> RenameAliasFn for F { + fn call(&self, name: &PlSmallStr) -> PolarsResult { self(name) } } @@ -269,7 +269,7 @@ impl GetOutput { pub fn from_type(dt: DataType) -> Self { SpecialEq::new(Arc::new(move |_: &Schema, _: Context, flds: &[Field]| { - Ok(Field::new(flds[0].name(), dt.clone())) + Ok(Field::new(flds[0].name().clone(), dt.clone())) })) } diff --git a/crates/polars-plan/src/dsl/from.rs b/crates/polars-plan/src/dsl/from.rs index eeaa631521cb..dcc53f51e1f9 100644 --- a/crates/polars-plan/src/dsl/from.rs +++ b/crates/polars-plan/src/dsl/from.rs @@ -8,7 +8,7 @@ impl From for Expr { impl From<&str> for Expr { fn from(s: &str) -> Self { - col(s) + col(PlSmallStr::from_str(s)) } } diff --git 
a/crates/polars-plan/src/dsl/function_expr/arg_where.rs b/crates/polars-plan/src/dsl/function_expr/arg_where.rs index 74bafa243e00..8f77be0724bd 100644 --- a/crates/polars-plan/src/dsl/function_expr/arg_where.rs +++ b/crates/polars-plan/src/dsl/function_expr/arg_where.rs @@ -6,7 +6,11 @@ pub(super) fn arg_where(s: &mut [Series]) -> PolarsResult> { let predicate = s[0].bool()?; if predicate.is_empty() { - Ok(Some(Series::full_null(predicate.name(), 0, &IDX_DTYPE))) + Ok(Some(Series::full_null( + predicate.name().clone(), + 0, + &IDX_DTYPE, + ))) } else { let capacity = predicate.sum().unwrap(); let mut out = Vec::with_capacity(capacity as usize); @@ -32,7 +36,7 @@ pub(super) fn arg_where(s: &mut [Series]) -> PolarsResult> { total_offset += arr.len(); }); - let ca = IdxCa::with_chunk(predicate.name(), IdxArr::from_vec(out)); + let ca = IdxCa::with_chunk(predicate.name().clone(), IdxArr::from_vec(out)); Ok(Some(ca.into_series())) } } diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index ece457aa7143..0de5e9d99883 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -221,7 +221,9 @@ pub(super) fn contains(s: &[Series]) -> PolarsResult { polars_ensure!(matches!(array.dtype(), DataType::Array(_, _)), SchemaMismatch: "invalid series dtype: expected `Array`, got `{}`", array.dtype(), ); - Ok(is_in(item, array)?.with_name(array.name()).into_series()) + Ok(is_in(item, array)? 
+ .with_name(array.name().clone()) + .into_series()) } #[cfg(feature = "array_count")] diff --git a/crates/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs index 3a2525fec060..f803ba0ba952 100644 --- a/crates/polars-plan/src/dsl/function_expr/binary.rs +++ b/crates/polars-plan/src/dsl/function_expr/binary.rs @@ -86,7 +86,10 @@ impl From for SpecialEq> { pub(super) fn contains(s: &[Series]) -> PolarsResult { let ca = s[0].binary()?; let lit = s[1].binary()?; - Ok(ca.contains_chunked(lit).with_name(ca.name()).into_series()) + Ok(ca + .contains_chunked(lit) + .with_name(ca.name().clone()) + .into_series()) } pub(super) fn ends_with(s: &[Series]) -> PolarsResult { @@ -95,7 +98,7 @@ pub(super) fn ends_with(s: &[Series]) -> PolarsResult { Ok(ca .ends_with_chunked(suffix) - .with_name(ca.name()) + .with_name(ca.name().clone()) .into_series()) } @@ -105,7 +108,7 @@ pub(super) fn starts_with(s: &[Series]) -> PolarsResult { Ok(ca .starts_with_chunked(prefix) - .with_name(ca.name()) + .with_name(ca.name().clone()) .into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index d77da88f69a7..8816113c3636 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -133,18 +133,18 @@ impl From for FunctionExpr { fn any(s: &Series, ignore_nulls: bool) -> PolarsResult { let ca = s.bool()?; if ignore_nulls { - Ok(Series::new(s.name(), [ca.any()])) + Ok(Series::new(s.name().clone(), [ca.any()])) } else { - Ok(Series::new(s.name(), [ca.any_kleene()])) + Ok(Series::new(s.name().clone(), [ca.any_kleene()])) } } fn all(s: &Series, ignore_nulls: bool) -> PolarsResult { let ca = s.bool()?; if ignore_nulls { - Ok(Series::new(s.name(), [ca.all()])) + Ok(Series::new(s.name().clone(), [ca.all()])) } else { - Ok(Series::new(s.name(), [ca.all_kleene()])) + Ok(Series::new(s.name().clone(), 
[ca.all_kleene()])) } } @@ -217,16 +217,19 @@ fn any_horizontal(s: &[Series]) -> PolarsResult { .install(|| { s.par_iter() .try_fold( - || BooleanChunked::new("", &[false]), + || BooleanChunked::new(PlSmallStr::const_default(), &[false]), |acc, b| { let b = b.cast(&DataType::Boolean)?; let b = b.bool()?; PolarsResult::Ok((&acc).bitor(b)) }, ) - .try_reduce(|| BooleanChunked::new("", [false]), |a, b| Ok(a.bitor(b))) + .try_reduce( + || BooleanChunked::new(PlSmallStr::const_default(), [false]), + |a, b| Ok(a.bitor(b)), + ) })? - .with_name(s[0].name()); + .with_name(s[0].name().clone()); Ok(out.into_series()) } @@ -236,15 +239,18 @@ fn all_horizontal(s: &[Series]) -> PolarsResult { .install(|| { s.par_iter() .try_fold( - || BooleanChunked::new("", &[true]), + || BooleanChunked::new(PlSmallStr::const_default(), &[true]), |acc, b| { let b = b.cast(&DataType::Boolean)?; let b = b.bool()?; PolarsResult::Ok((&acc).bitand(b)) }, ) - .try_reduce(|| BooleanChunked::new("", [true]), |a, b| Ok(a.bitand(b))) + .try_reduce( + || BooleanChunked::new(PlSmallStr::const_default(), [true]), + |a, b| Ok(a.bitand(b)), + ) })? 
- .with_name(s[0].name()); + .with_name(s[0].name().clone()); Ok(out.into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/bounds.rs b/crates/polars-plan/src/dsl/function_expr/bounds.rs index 7dcce34e2a71..0f14feb5675f 100644 --- a/crates/polars-plan/src/dsl/function_expr/bounds.rs +++ b/crates/polars-plan/src/dsl/function_expr/bounds.rs @@ -1,7 +1,7 @@ use super::*; pub(super) fn upper_bound(s: &Series) -> PolarsResult { - let name = s.name(); + let name = s.name().clone(); use DataType::*; let s = match s.dtype().to_physical() { #[cfg(feature = "dtype-i8")] @@ -26,7 +26,7 @@ pub(super) fn upper_bound(s: &Series) -> PolarsResult { } pub(super) fn lower_bound(s: &Series) -> PolarsResult { - let name = s.name(); + let name = s.name().clone(); use DataType::*; let s = match s.dtype().to_physical() { #[cfg(feature = "dtype-i8")] diff --git a/crates/polars-plan/src/dsl/function_expr/cat.rs b/crates/polars-plan/src/dsl/function_expr/cat.rs index db50f4ef4429..9cc5d993a638 100644 --- a/crates/polars-plan/src/dsl/function_expr/cat.rs +++ b/crates/polars-plan/src/dsl/function_expr/cat.rs @@ -46,5 +46,5 @@ fn get_categories(s: &Series) -> PolarsResult { let ca = s.categorical()?; let rev_map = ca.get_rev_map(); let arr = rev_map.get_categories().clone().boxed(); - Series::try_from((ca.name(), arr)) + Series::try_from((ca.name().clone(), arr)) } diff --git a/crates/polars-plan/src/dsl/function_expr/coerce.rs b/crates/polars-plan/src/dsl/function_expr/coerce.rs index b131229b5f44..652866491edb 100644 --- a/crates/polars-plan/src/dsl/function_expr/coerce.rs +++ b/crates/polars-plan/src/dsl/function_expr/coerce.rs @@ -1,5 +1,5 @@ use polars_core::prelude::*; pub fn as_struct(s: &[Series]) -> PolarsResult { - Ok(StructChunked::from_series(s[0].name(), s)?.into_series()) + Ok(StructChunked::from_series(s[0].name().clone(), s)?.into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/correlation.rs 
b/crates/polars-plan/src/dsl/function_expr/correlation.rs index 1510d5145fc1..5437c7c4d795 100644 --- a/crates/polars-plan/src/dsl/function_expr/correlation.rs +++ b/crates/polars-plan/src/dsl/function_expr/correlation.rs @@ -39,7 +39,7 @@ pub(super) fn corr(s: &[Series], ddof: u8, method: CorrelationMethod) -> PolarsR fn covariance(s: &[Series], ddof: u8) -> PolarsResult { let a = &s[0]; let b = &s[1]; - let name = "cov"; + let name = PlSmallStr::from_static("cov"); use polars_ops::chunked_array::cov::cov; let ret = match a.dtype() { @@ -64,13 +64,13 @@ fn covariance(s: &[Series], ddof: u8) -> PolarsResult { fn pearson_corr(s: &[Series], ddof: u8) -> PolarsResult { let a = &s[0]; let b = &s[1]; - let name = "pearson_corr"; + let name = PlSmallStr::from_static("pearson_corr"); use polars_ops::chunked_array::cov::pearson_corr; let ret = match a.dtype() { DataType::Float32 => { let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32); - return Ok(Series::new(name, &[ret])); + return Ok(Series::new(name.clone(), &[ret])); }, DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof), DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof), @@ -94,10 +94,10 @@ fn spearman_rank_corr(s: &[Series], ddof: u8, propagate_nans: bool) -> PolarsRes let (a, b) = coalesce_nulls_series(a, b); - let name = "spearman_rank_correlation"; + let name = PlSmallStr::from_static("spearman_rank_correlation"); if propagate_nans && a.dtype().is_float() { for s in [&a, &b] { - if nan_max_s(s, "") + if nan_max_s(s, PlSmallStr::const_default()) .get(0) .unwrap() .extract::() diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index 604c915c817a..1d1d6a5022e4 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -123,7 +123,7 @@ impl TemporalFunction { time_unit, time_zone, } => Ok(Field::new( - 
"datetime", + PlSmallStr::from_static("datetime"), DataType::Datetime(*time_unit, time_zone.clone()), )), Combine(tu) => mapper.try_map_dtype(|dt| match dt { diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index cd82ae4251d8..12275fc57200 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -58,11 +58,11 @@ pub(super) fn value_counts( s: &Series, sort: bool, parallel: bool, - name: String, + name: PlSmallStr, normalize: bool, ) -> PolarsResult { s.value_counts(sort, parallel, name, normalize) - .map(|df| df.into_struct(s.name()).into_series()) + .map(|df| df.into_struct(s.name().clone()).into_series()) } #[cfg(feature = "unique_counts")] @@ -121,13 +121,14 @@ pub(super) fn mode(s: &Series) -> PolarsResult { #[cfg(feature = "moment")] pub(super) fn skew(s: &Series, bias: bool) -> PolarsResult { - s.skew(bias).map(|opt_v| Series::new(s.name(), &[opt_v])) + s.skew(bias) + .map(|opt_v| Series::new(s.name().clone(), &[opt_v])) } #[cfg(feature = "moment")] pub(super) fn kurtosis(s: &Series, fisher: bool, bias: bool) -> PolarsResult { s.kurtosis(fisher, bias) - .map(|opt_v| Series::new(s.name(), &[opt_v])) + .map(|opt_v| Series::new(s.name().clone(), &[opt_v])) } pub(super) fn arg_unique(s: &Series) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/fill_null.rs b/crates/polars-plan/src/dsl/function_expr/fill_null.rs index d5e408c0082d..b78b4bf7edbf 100644 --- a/crates/polars-plan/src/dsl/function_expr/fill_null.rs +++ b/crates/polars-plan/src/dsl/function_expr/fill_null.rs @@ -28,7 +28,10 @@ pub(super) fn fill_null(s: &[Series]) -> PolarsResult { let cats = series.to_physical_repr(); let mask = cats.is_not_null(); let out = cats - .zip_with_same_type(&mask, &Series::new("", &[idx])) + .zip_with_same_type( + &mask, + &Series::new(PlSmallStr::const_default(), &[idx]), + ) .unwrap(); unsafe { return 
out.cast_unchecked(series.dtype()) } } diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index e68b080d17f1..05df577ed8f3 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -247,7 +247,7 @@ pub(super) fn contains(args: &mut [Series]) -> PolarsResult> { SchemaMismatch: "invalid series dtype: expected `List`, got `{}`", list.dtype(), ); polars_ops::prelude::is_in(item, list).map(|mut ca| { - ca.rename(list.name()); + ca.rename(list.name().clone()); Some(ca.into_series()) }) } @@ -378,7 +378,7 @@ pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { .collect_trusted() }, }; - out.rename(s.name()); + out.rename(s.name().clone()); Ok(Some(out.into_series())) } @@ -417,7 +417,7 @@ pub(super) fn get(s: &mut [Series], null_on_oob: bool) -> PolarsResult PolarsResult>()? }; - let s = Series::try_from((ca.name(), arr.values().clone())).unwrap(); + let s = Series::try_from((ca.name().clone(), arr.values().clone())).unwrap(); unsafe { s.take_unchecked(&take_by) } .cast(ca.inner_dtype()) .map(Some) @@ -599,13 +599,13 @@ pub(super) fn set_operation(s: &[Series], set_type: SetOperation) -> PolarsResul if s0.len() == 0 { Ok(s0.clone()) } else { - Ok(s1.clone().with_name(s0.name())) + Ok(s1.clone().with_name(s0.name().clone())) } }, SetOperation::Difference => Ok(s0.clone()), SetOperation::Union | SetOperation::SymmetricDifference => { if s0.len() == 0 { - Ok(s1.clone().with_name(s0.name())) + Ok(s1.clone().with_name(s0.name().clone())) } else { Ok(s0.clone()) } diff --git a/crates/polars-plan/src/dsl/function_expr/log.rs b/crates/polars-plan/src/dsl/function_expr/log.rs index 8793f9614a77..42c71c681f33 100644 --- a/crates/polars-plan/src/dsl/function_expr/log.rs +++ b/crates/polars-plan/src/dsl/function_expr/log.rs @@ -4,9 +4,9 @@ pub(super) fn entropy(s: &Series, base: f64, normalize: bool) -> PolarsResult, - labels: Option>, + labels: Option>, 
left_closed: bool, include_breaks: bool, }, #[cfg(feature = "cutqcut")] QCut { probs: Vec, - labels: Option>, + labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, @@ -307,9 +307,9 @@ pub enum FunctionExpr { /// This will lead to calls over FFI. FfiPlugin { /// Shared library. - lib: Arc, + lib: PlSmallStr, /// Identifier in the shared lib. - symbol: Arc, + symbol: PlSmallStr, /// Pickle serialized keyword arguments. kwargs: Arc<[u8]>, }, @@ -879,7 +879,10 @@ impl From for SpecialEq> { NullCount => { let f = |s: &mut [Series]| { let s = &s[0]; - Ok(Some(Series::new(s.name(), [s.null_count() as IdxSize]))) + Ok(Some(Series::new( + s.name().clone(), + [s.null_count() as IdxSize], + ))) }; wrap!(f) }, diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index a9bacae5ae84..a12ef242f435 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -65,7 +65,11 @@ where if exponent.len() == 1 { let Some(exponent_value) = exponent.get(0) else { - return Ok(Some(Series::full_null(base.name(), base.len(), &dtype))); + return Ok(Some(Series::full_null( + base.name().clone(), + base.len(), + &dtype, + ))); }; let s = match exponent_value.to_f64().unwrap() { a if a == 1.0 => base.clone().into_series(), @@ -104,7 +108,11 @@ where if exponent.len() == 1 { let Some(exponent_value) = exponent.get(0) else { - return Ok(Some(Series::full_null(base.name(), base.len(), &dtype))); + return Ok(Some(Series::full_null( + base.name().clone(), + base.len(), + &dtype, + ))); }; let s = match exponent_value.to_u64().unwrap() { 1 => base.clone().into_series(), diff --git a/crates/polars-plan/src/dsl/function_expr/random.rs b/crates/polars-plan/src/dsl/function_expr/random.rs index 1719e42a2feb..cb21e08367aa 100644 --- a/crates/polars-plan/src/dsl/function_expr/random.rs +++ b/crates/polars-plan/src/dsl/function_expr/random.rs @@ -46,7 +46,7 @@ 
pub(super) fn sample_frac( match frac.get(0) { Some(frac) => src.sample_frac(frac, with_replacement, shuffle, seed), - None => Ok(Series::new_empty(src.name(), src.dtype())), + None => Ok(Series::new_empty(src.name().clone(), src.dtype())), } } @@ -69,6 +69,6 @@ pub(super) fn sample_n( match n.get(0) { Some(n) => src.sample_n(n as usize, with_replacement, shuffle, seed), - None => Ok(Series::new_empty(src.name(), src.dtype())), + None => Ok(Series::new_empty(src.name().clone(), src.dtype())), } } diff --git a/crates/polars-plan/src/dsl/function_expr/range/date_range.rs b/crates/polars-plan/src/dsl/function_expr/range/date_range.rs index bef4946e5729..6d40cbf69498 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/date_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/date_range.rs @@ -25,7 +25,7 @@ pub(super) fn date_range( ComputeError: "`interval` input for `date_range` must consist of full days, got: {interval}" ); - let name = start.name(); + let name = start.name().clone(); let start = temporal_series_to_i64_scalar(&start) .ok_or_else(|| polars_err!(ComputeError: "start is an out-of-range time."))? 
* MILLISECONDS_IN_DAY; @@ -67,7 +67,7 @@ pub(super) fn date_ranges( let end = end.i64().unwrap() * MILLISECONDS_IN_DAY; let mut builder = ListPrimitiveChunkedBuilder::::new( - start.name(), + start.name().clone(), start.len(), start.len() * CAPACITY_FACTOR, DataType::Int32, @@ -75,7 +75,7 @@ pub(super) fn date_ranges( let range_impl = |start, end, builder: &mut ListPrimitiveChunkedBuilder| { let rng = datetime_range_impl( - "", + PlSmallStr::const_default(), start, end, interval, diff --git a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs index e046b94b03a7..3aae78024dd2 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs @@ -95,7 +95,7 @@ pub(super) fn datetime_range( Some(tz) => Some(parse_time_zone(tz)?), _ => None, }; - datetime_range_impl(name, start, end, interval, closed, tu, tz.as_ref())? + datetime_range_impl(name.clone(), start, end, interval, closed, tu, tz.as_ref())? 
}, _ => unimplemented!(), }; @@ -189,7 +189,7 @@ pub(super) fn datetime_ranges( let out = match dtype { DataType::Datetime(tu, ref tz) => { let mut builder = ListPrimitiveChunkedBuilder::::new( - start.name(), + start.name().clone(), start.len(), start.len() * CAPACITY_FACTOR, DataType::Int64, @@ -201,7 +201,15 @@ pub(super) fn datetime_ranges( _ => None, }; let range_impl = |start, end, builder: &mut ListPrimitiveChunkedBuilder| { - let rng = datetime_range_impl("", start, end, interval, closed, tu, tz.as_ref())?; + let rng = datetime_range_impl( + PlSmallStr::const_default(), + start, + end, + interval, + closed, + tu, + tz.as_ref(), + )?; builder.append_slice(rng.cont_slice().unwrap()); Ok(()) }; @@ -219,7 +227,7 @@ impl<'a> FieldsMapper<'a> { pub(super) fn map_to_datetime_range_dtype( &self, time_unit: Option<&TimeUnit>, - time_zone: Option<&str>, + time_zone: Option<&PlSmallStr>, ) -> PolarsResult { let data_dtype = self.map_to_supertype()?.dtype; @@ -233,10 +241,7 @@ impl<'a> FieldsMapper<'a> { Some(tu) => *tu, None => data_tu, }; - let tz = match time_zone { - Some(tz) => Some(tz.to_string()), - None => data_tz, - }; + let tz = time_zone.cloned().or(data_tz); Ok(DataType::Datetime(tu, tz)) } diff --git a/crates/polars-plan/src/dsl/function_expr/range/int_range.rs b/crates/polars-plan/src/dsl/function_expr/range/int_range.rs index 5344ec0b5ee8..f1ae0ffe13a7 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/int_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/int_range.rs @@ -27,7 +27,7 @@ pub(super) fn int_range(s: &[Series], step: i64, dtype: DataType) -> PolarsResul with_match_physical_integer_polars_type!(dtype, |$T| { let start_v = get_first_series_value::<$T>(start)?; let end_v = get_first_series_value::<$T>(end)?; - new_int_range::<$T>(start_v, end_v, step, name) + new_int_range::<$T>(start_v, end_v, step, name.clone()) }) } @@ -58,7 +58,7 @@ pub(super) fn int_ranges(s: &[Series]) -> PolarsResult { let len = 
std::cmp::max(start.len(), end.len()); let mut builder = ListPrimitiveChunkedBuilder::::new( // The name should follow our left hand rule. - start.name(), + start.name().clone(), len, len * CAPACITY_FACTOR, DataType::Int64, diff --git a/crates/polars-plan/src/dsl/function_expr/range/mod.rs b/crates/polars-plan/src/dsl/function_expr/range/mod.rs index b13d45bdd73c..3350f0c6f8f5 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/mod.rs @@ -83,7 +83,7 @@ impl RangeFunction { } => { // output dtype may change based on `interval`, `time_unit`, and `time_zone` let dtype = - mapper.map_to_datetime_range_dtype(time_unit.as_ref(), time_zone.as_deref())?; + mapper.map_to_datetime_range_dtype(time_unit.as_ref(), time_zone.as_ref())?; mapper.with_dtype(dtype) }, #[cfg(feature = "dtype-datetime")] @@ -95,7 +95,7 @@ impl RangeFunction { } => { // output dtype may change based on `interval`, `time_unit`, and `time_zone` let inner_dtype = - mapper.map_to_datetime_range_dtype(time_unit.as_ref(), time_zone.as_deref())?; + mapper.map_to_datetime_range_dtype(time_unit.as_ref(), time_zone.as_ref())?; mapper.with_dtype(DataType::List(Box::new(inner_dtype))) }, #[cfg(feature = "dtype-time")] diff --git a/crates/polars-plan/src/dsl/function_expr/range/time_range.rs b/crates/polars-plan/src/dsl/function_expr/range/time_range.rs index 991368356cc5..f2518456fed7 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/time_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/time_range.rs @@ -25,7 +25,7 @@ pub(super) fn time_range( let end = temporal_series_to_i64_scalar(&end.cast(&dtype)?) 
.ok_or_else(|| polars_err!(ComputeError: "end is an out-of-range time."))?; - let out = time_range_impl(name, start, end, interval, closed)?; + let out = time_range_impl(name.clone(), start, end, interval, closed)?; Ok(out.cast(&dtype).unwrap().into_series()) } @@ -47,14 +47,14 @@ pub(super) fn time_ranges( let len = std::cmp::max(start.len(), end.len()); let mut builder = ListPrimitiveChunkedBuilder::::new( - start.name(), + start.name().clone(), len, len * CAPACITY_FACTOR, DataType::Int64, ); let range_impl = |start, end, builder: &mut ListPrimitiveChunkedBuilder| { - let rng = time_range_impl("", start, end, interval, closed)?; + let rng = time_range_impl(PlSmallStr::const_default(), start, end, interval, closed)?; builder.append_slice(rng.cont_slice().unwrap()); Ok(()) }; diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index a385c27820d6..8f3562d6f176 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -95,7 +95,7 @@ impl FunctionExpr { }), #[cfg(feature = "dtype-struct")] AsStruct => Ok(Field::new( - fields[0].name(), + fields[0].name().clone(), DataType::Struct(fields.to_vec()), )), #[cfg(feature = "top_k")] @@ -115,8 +115,8 @@ impl FunctionExpr { IDX_DTYPE }; DataType::Struct(vec![ - Field::new(fields[0].name().as_str(), dt.clone()), - Field::new(name, count_dt), + Field::new(fields[0].name().clone(), dt.clone()), + Field::new(name.clone(), count_dt), ]) }), #[cfg(feature = "unique_counts")] @@ -143,15 +143,18 @@ impl FunctionExpr { if *include_breakpoint || *include_category { let mut fields = Vec::with_capacity(3); if *include_breakpoint { - fields.push(Field::new("breakpoint", DataType::Float64)); + fields.push(Field::new( + PlSmallStr::from_static("breakpoint"), + DataType::Float64, + )); } if *include_category { fields.push(Field::new( - "category", + PlSmallStr::from_static("category"), DataType::Categorical(None, 
Default::default()), )); } - fields.push(Field::new("count", IDX_DTYPE)); + fields.push(Field::new(PlSmallStr::from_static("count"), IDX_DTYPE)); mapper.with_dtype(DataType::Struct(fields)) } else { mapper.with_dtype(IDX_DTYPE) @@ -231,8 +234,11 @@ impl FunctionExpr { .. } => { let struct_dt = DataType::Struct(vec![ - Field::new("breakpoint", DataType::Float64), - Field::new("category", DataType::Categorical(None, Default::default())), + Field::new(PlSmallStr::from_static("breakpoint"), DataType::Float64), + Field::new( + PlSmallStr::from_static("category"), + DataType::Categorical(None, Default::default()), + ), ]); mapper.with_dtype(struct_dt) }, @@ -269,16 +275,19 @@ impl FunctionExpr { .. } => { let struct_dt = DataType::Struct(vec![ - Field::new("breakpoint", DataType::Float64), - Field::new("category", DataType::Categorical(None, Default::default())), + Field::new(PlSmallStr::from_static("breakpoint"), DataType::Float64), + Field::new( + PlSmallStr::from_static("category"), + DataType::Categorical(None, Default::default()), + ), ]); mapper.with_dtype(struct_dt) }, #[cfg(feature = "rle")] RLE => mapper.map_dtype(|dt| { DataType::Struct(vec![ - Field::new("len", IDX_DTYPE), - Field::new("value", dt.clone()), + Field::new(PlSmallStr::from_static("len"), IDX_DTYPE), + Field::new(PlSmallStr::from_static("value"), dt.clone()), ]) }), #[cfg(feature = "rle")] @@ -363,13 +372,13 @@ impl<'a> FieldsMapper<'a> { /// Set a dtype. pub fn with_dtype(&self, dtype: DataType) -> PolarsResult { - Ok(Field::new(self.fields[0].name(), dtype)) + Ok(Field::new(self.fields[0].name().clone(), dtype)) } /// Map a single dtype. 
pub fn map_dtype(&self, func: impl FnOnce(&DataType) -> DataType) -> PolarsResult { let dtype = func(self.fields[0].data_type()); - Ok(Field::new(self.fields[0].name(), dtype)) + Ok(Field::new(self.fields[0].name().clone(), dtype)) } pub fn get_fields_lens(&self) -> usize { @@ -417,7 +426,7 @@ impl<'a> FieldsMapper<'a> { func: impl FnOnce(&DataType) -> PolarsResult, ) -> PolarsResult { let dtype = func(self.fields[0].data_type())?; - Ok(Field::new(self.fields[0].name(), dtype)) + Ok(Field::new(self.fields[0].name().clone(), dtype)) } /// Map all dtypes with a potentially failing mapper function. @@ -515,12 +524,21 @@ impl<'a> FieldsMapper<'a> { let exponent_dtype = self.fields[1].data_type(); if base_dtype.is_integer() { if exponent_dtype.is_float() { - Ok(Field::new(self.fields[0].name(), exponent_dtype.clone())) + Ok(Field::new( + self.fields[0].name().clone(), + exponent_dtype.clone(), + )) } else { - Ok(Field::new(self.fields[0].name(), base_dtype.clone())) + Ok(Field::new( + self.fields[0].name().clone(), + base_dtype.clone(), + )) } } else { - Ok(Field::new(self.fields[0].name(), base_dtype.clone())) + Ok(Field::new( + self.fields[0].name().clone(), + base_dtype.clone(), + )) } } diff --git a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs index c2a0d16d78dc..6ebc5f3d221e 100644 --- a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs +++ b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs @@ -106,7 +106,7 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { dt => polars_bail!(opq = shift_and_fill, dt), } } else { - Ok(Series::full_null(s.name(), s.len(), s.dtype())) + Ok(Series::full_null(s.name().clone(), s.len(), s.dtype())) } } @@ -123,6 +123,6 @@ pub fn shift(args: &[Series]) -> PolarsResult { match n.get(0) { Some(n) => Ok(s.shift(n)), - None => Ok(Series::full_null(s.name(), s.len(), s.dtype())), + None => Ok(Series::full_null(s.name().clone(), 
s.len(), s.dtype())), } } diff --git a/crates/polars-plan/src/dsl/function_expr/sign.rs b/crates/polars-plan/src/dsl/function_expr/sign.rs index 41707664e3ac..a7bf4d3277e6 100644 --- a/crates/polars-plan/src/dsl/function_expr/sign.rs +++ b/crates/polars-plan/src/dsl/function_expr/sign.rs @@ -1,41 +1,34 @@ +use num::{One, Zero}; use polars_core::export::num; -use DataType::*; +use polars_core::with_match_physical_numeric_polars_type; use super::*; pub(super) fn sign(s: &Series) -> PolarsResult { - match s.dtype() { - Float32 => { - let ca = s.f32().unwrap(); - sign_float(ca) - }, - Float64 => { - let ca = s.f64().unwrap(); - sign_float(ca) - }, - dt if dt.is_numeric() => { - let s = s.cast(&Float64)?; - sign(&s) - }, - dt => polars_bail!(opq = sign, dt), - } + let dt = s.dtype(); + polars_ensure!(dt.is_numeric(), opq = sign, dt); + with_match_physical_numeric_polars_type!(dt, |$T| { + let ca: &ChunkedArray<$T> = s.as_ref().as_ref(); + Ok(sign_impl(ca)) + }) } -fn sign_float(ca: &ChunkedArray) -> PolarsResult +fn sign_impl(ca: &ChunkedArray) -> Series where - T: PolarsFloatType, - T::Native: num::Float, + T: PolarsNumericType, ChunkedArray: IntoSeries, { - ca.apply_values(signum_improved).into_series().cast(&Int64) -} - -// Wrapper for the signum function that handles +/-0.0 inputs differently -// See discussion here: https://github.com/rust-lang/rust/issues/57543 -fn signum_improved(v: F) -> F { - if v.is_zero() { - v - } else { - v.signum() - } + ca.apply_values(|x| { + if x < T::Native::zero() { + T::Native::zero() - T::Native::one() + } else if x > T::Native::zero() { + T::Native::one() + } else { + // Returning x here ensures we return NaN for NaN input, and + // maintain the sign for signed zeroes (although we don't really + // care about the latter). 
+ x + } + }) + .into_series() } diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 77a9f9e519bb..9a5d2a9ff537 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -7,7 +7,7 @@ use once_cell::sync::Lazy; use polars_core::chunked_array::temporal::validate_time_zone; use polars_core::utils::handle_casting_failures; #[cfg(feature = "dtype-struct")] -use polars_utils::format_smartstring; +use polars_utils::format_pl_smallstr; #[cfg(feature = "regex")] use regex::{escape, Regex}; #[cfg(feature = "serde")] @@ -25,12 +25,12 @@ static TZ_AWARE_RE: Lazy = pub enum StringFunction { #[cfg(feature = "concat_str")] ConcatHorizontal { - delimiter: String, + delimiter: PlSmallStr, ignore_nulls: bool, }, #[cfg(feature = "concat_str")] ConcatVertical { - delimiter: String, + delimiter: PlSmallStr, ignore_nulls: bool, }, #[cfg(feature = "regex")] @@ -45,7 +45,7 @@ pub enum StringFunction { #[cfg(feature = "extract_groups")] ExtractGroups { dtype: DataType, - pat: String, + pat: PlSmallStr, }, #[cfg(feature = "regex")] Find { @@ -182,13 +182,13 @@ impl StringFunction { #[cfg(feature = "dtype-struct")] SplitExact { n, .. 
} => mapper.with_dtype(DataType::Struct( (0..n + 1) - .map(|i| Field::from_owned(format_smartstring!("field_{i}"), DataType::String)) + .map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String)) .collect(), )), #[cfg(feature = "dtype-struct")] SplitN(n) => mapper.with_dtype(DataType::Struct( (0..*n) - .map(|i| Field::from_owned(format_smartstring!("field_{i}"), DataType::String)) + .map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String)) .collect(), )), #[cfg(feature = "find_many")] @@ -576,7 +576,7 @@ pub(super) fn extract_all(args: &[Series]) -> PolarsResult { ca.extract_all(pat).map(|ca| ca.into_series()) } else { Ok(Series::full_null( - ca.name(), + ca.name().clone(), ca.len(), &DataType::List(Box::new(DataType::String)), )) @@ -596,7 +596,11 @@ pub(super) fn count_matches(args: &[Series], literal: bool) -> PolarsResult), - RenameFields(Arc<[String]>), - PrefixFields(Arc), - SuffixFields(Arc), + FieldByName(PlSmallStr), + RenameFields(Arc<[PlSmallStr]>), + PrefixFields(PlSmallStr), + SuffixFields(PlSmallStr), #[cfg(feature = "json")] JsonEncode, WithFields, - MultipleFields(Arc<[ColumnName]>), + MultipleFields(Arc<[PlSmallStr]>), } impl StructFunction { @@ -50,7 +51,7 @@ impl StructFunction { let fields = fields .iter() .zip(names.as_ref()) - .map(|(fld, name)| Field::new(name, fld.data_type().clone())) + .map(|(fld, name)| Field::new(name.clone(), fld.data_type().clone())) .collect(); DataType::Struct(fields) }, @@ -60,7 +61,7 @@ impl StructFunction { dt => DataType::Struct( names .iter() - .map(|name| Field::new(name, dt.clone())) + .map(|name| Field::new(name.clone(), dt.clone())) .collect(), ), }), @@ -70,7 +71,10 @@ impl StructFunction { .iter() .map(|fld| { let name = fld.name(); - Field::new(&format!("{prefix}{name}"), fld.data_type().clone()) + Field::new( + format_pl_smallstr!("{prefix}{name}"), + fld.data_type().clone(), + ) }) .collect(); Ok(DataType::Struct(fields)) @@ -83,7 +87,10 @@ impl StructFunction { .iter() 
.map(|fld| { let name = fld.name(); - Field::new(&format!("{name}{suffix}"), fld.data_type().clone()) + Field::new( + format_pl_smallstr!("{name}{suffix}"), + fld.data_type().clone(), + ) }) .collect(); Ok(DataType::Struct(fields)) @@ -108,7 +115,7 @@ impl StructFunction { let dtype = DataType::Struct( name_2_dtype .iter() - .map(|(name, dtype)| Field::new(name, (*dtype).clone())) + .map(|(&name, &dtype)| Field::new(name.clone(), dtype.clone())) .collect(), ); let mut out = struct_.clone(); @@ -146,10 +153,10 @@ impl From for SpecialEq> { use StructFunction::*; match func { FieldByIndex(_) => panic!("should be replaced"), - FieldByName(name) => map!(get_by_name, name.clone()), + FieldByName(name) => map!(get_by_name, &name), RenameFields(names) => map!(rename_fields, names.clone()), - PrefixFields(prefix) => map!(prefix_fields, prefix.clone()), - SuffixFields(suffix) => map!(suffix_fields, suffix.clone()), + PrefixFields(prefix) => map!(prefix_fields, prefix.as_str()), + SuffixFields(suffix) => map!(suffix_fields, suffix.as_str()), #[cfg(feature = "json")] JsonEncode => map!(to_json), WithFields => map_as_slice!(with_fields), @@ -158,12 +165,12 @@ impl From for SpecialEq> { } } -pub(super) fn get_by_name(s: &Series, name: Arc) -> PolarsResult { +pub(super) fn get_by_name(s: &Series, name: &str) -> PolarsResult { let ca = s.struct_()?; - ca.field_by_name(name.as_ref()) + ca.field_by_name(name) } -pub(super) fn rename_fields(s: &Series, names: Arc<[String]>) -> PolarsResult { +pub(super) fn rename_fields(s: &Series, names: Arc<[PlSmallStr]>) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -171,16 +178,16 @@ pub(super) fn rename_fields(s: &Series, names: Arc<[String]>) -> PolarsResult>(); - let mut out = StructChunked::from_series(ca.name(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); Ok(out.into_series()) } -pub(super) fn prefix_fields(s: &Series, prefix: Arc) -> 
PolarsResult { +pub(super) fn prefix_fields(s: &Series, prefix: &str) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -188,16 +195,16 @@ pub(super) fn prefix_fields(s: &Series, prefix: Arc) -> PolarsResult>(); - let mut out = StructChunked::from_series(ca.name(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); Ok(out.into_series()) } -pub(super) fn suffix_fields(s: &Series, suffix: Arc) -> PolarsResult { +pub(super) fn suffix_fields(s: &Series, suffix: &str) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -205,11 +212,11 @@ pub(super) fn suffix_fields(s: &Series, suffix: Arc) -> PolarsResult>(); - let mut out = StructChunked::from_series(ca.name(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); Ok(out.into_series()) } @@ -224,7 +231,7 @@ pub(super) fn to_json(s: &Series) -> PolarsResult { polars_json::json::write::serialize_to_utf8(arr.as_ref()) }); - Ok(StringChunked::from_chunk_iter(ca.name(), iter).into_series()) + Ok(StringChunked::from_chunk_iter(ca.name().clone(), iter).into_series()) } pub(super) fn with_fields(args: &[Series]) -> PolarsResult { @@ -244,7 +251,7 @@ pub(super) fn with_fields(args: &[Series]) -> PolarsResult { } let new_fields = fields.into_values().cloned().collect::>(); - let mut out = StructChunked::from_series(ca.name(), &new_fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?; out.zip_outer_validity(ca); Ok(out.into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/temporal.rs b/crates/polars-plan/src/dsl/function_expr/temporal.rs index fe580259d77e..18340a00adaf 100644 --- a/crates/polars-plan/src/dsl/function_expr/temporal.rs +++ b/crates/polars-plan/src/dsl/function_expr/temporal.rs @@ -178,7 +178,7 @@ pub(super) fn datetime( }; let mut s = ca.into_series(); - s.rename("datetime"); + 
s.rename(PlSmallStr::from_static("datetime")); Ok(s) } diff --git a/crates/polars-plan/src/dsl/functions/concat.rs b/crates/polars-plan/src/dsl/functions/concat.rs index 6f420c72f768..d15b1769cf3a 100644 --- a/crates/polars-plan/src/dsl/functions/concat.rs +++ b/crates/polars-plan/src/dsl/functions/concat.rs @@ -4,7 +4,7 @@ use super::*; /// Horizontally concat string columns in linear time pub fn concat_str>(s: E, separator: &str, ignore_nulls: bool) -> Expr { let input = s.as_ref().to_vec(); - let separator = separator.to_string(); + let separator = separator.into(); Expr::Function { input, diff --git a/crates/polars-plan/src/dsl/functions/horizontal.rs b/crates/polars-plan/src/dsl/functions/horizontal.rs index 1b49791ebc26..eb0c79b3b0f7 100644 --- a/crates/polars-plan/src/dsl/functions/horizontal.rs +++ b/crates/polars-plan/src/dsl/functions/horizontal.rs @@ -8,11 +8,11 @@ fn cum_fold_dtype() -> GetOutput { st = get_supertype(&st, &fld.dtype).unwrap(); } Ok(Field::new( - &fields[0].name, + fields[0].name.clone(), DataType::Struct( fields .iter() - .map(|fld| Field::new(fld.name(), st.clone())) + .map(|fld| Field::new(fld.name().clone(), st.clone())) .collect(), ), )) @@ -118,15 +118,16 @@ where let mut result = vec![acc.clone()]; for s in s_iter { - let name = s.name().to_string(); + let name = s.name().clone(); if let Some(a) = f(acc.clone(), s.clone())? { acc = a; } - acc.rename(&name); + acc.rename(name); result.push(acc.clone()); } - StructChunked::from_series(acc.name(), &result).map(|ca| Some(ca.into_series())) + StructChunked::from_series(acc.name().clone(), &result) + .map(|ca| Some(ca.into_series())) }, None => Err(polars_err!(ComputeError: "`reduce` did not have any expressions to fold")), } @@ -167,15 +168,15 @@ where } for s in series { - let name = s.name().to_string(); + let name = s.name().clone(); if let Some(a) = f(acc.clone(), s)? 
{ acc = a; - acc.rename(&name); + acc.rename(name); result.push(acc.clone()); } } - StructChunked::from_series(acc.name(), &result).map(|ca| Some(ca.into_series())) + StructChunked::from_series(acc.name().clone(), &result).map(|ca| Some(ca.into_series())) }) as Arc); Expr::AnonymousFunction { diff --git a/crates/polars-plan/src/dsl/functions/index.rs b/crates/polars-plan/src/dsl/functions/index.rs index 7a452a033245..a3c840125181 100644 --- a/crates/polars-plan/src/dsl/functions/index.rs +++ b/crates/polars-plan/src/dsl/functions/index.rs @@ -11,7 +11,7 @@ pub fn arg_sort_by>(by: E, sort_options: SortMultipleOptions) - let name = expr_output_name(e).unwrap(); int_range(lit(0 as IdxSize), len().cast(IDX_DTYPE), 1, IDX_DTYPE) .sort_by(by, sort_options) - .alias(name.as_ref()) + .alias(name) } #[cfg(feature = "arg_where")] diff --git a/crates/polars-plan/src/dsl/functions/repeat.rs b/crates/polars-plan/src/dsl/functions/repeat.rs index 9da42c36242f..5c3084fb7caf 100644 --- a/crates/polars-plan/src/dsl/functions/repeat.rs +++ b/crates/polars-plan/src/dsl/functions/repeat.rs @@ -16,5 +16,6 @@ pub fn repeat>(value: E, n: Expr) -> Expr { )?; Ok(Some(s.new_from_index(0, n))) }; - apply_binary(value.into(), n, function, GetOutput::same_type()).alias("repeat") + apply_binary(value.into(), n, function, GetOutput::same_type()) + .alias(PlSmallStr::from_static("repeat")) } diff --git a/crates/polars-plan/src/dsl/functions/selectors.rs b/crates/polars-plan/src/dsl/functions/selectors.rs index 11c92a40b1ba..28d52c10f835 100644 --- a/crates/polars-plan/src/dsl/functions/selectors.rs +++ b/crates/polars-plan/src/dsl/functions/selectors.rs @@ -24,10 +24,14 @@ use super::*; /// // only if regex features is activated /// col("^foo.*$") /// ``` -pub fn col(name: &str) -> Expr { - match name { +pub fn col(name: S) -> Expr +where + S: Into, +{ + let name = name.into(); + match name.as_str() { "*" => Expr::Wildcard, - _ => Expr::Column(ColumnName::from(name)), + _ => Expr::Column(name), } 
} @@ -37,12 +41,12 @@ pub fn all() -> Expr { } /// Select multiple columns by name. -pub fn cols>(names: I) -> Expr { - let names = names.into_vec(); - let names = names - .into_iter() - .map(|v| ColumnName::from(v.as_str())) - .collect(); +pub fn cols(names: I) -> Expr +where + I: IntoIterator, + S: Into, +{ + let names = names.into_iter().map(|x| x.into()).collect(); Expr::Columns(names) } diff --git a/crates/polars-plan/src/dsl/functions/temporal.rs b/crates/polars-plan/src/dsl/functions/temporal.rs index 071c4a1e6b84..145b521092d3 100644 --- a/crates/polars-plan/src/dsl/functions/temporal.rs +++ b/crates/polars-plan/src/dsl/functions/temporal.rs @@ -173,7 +173,10 @@ impl DatetimeArgs { TimeUnit::Nanoseconds => dt.and_utc().timestamp_nanos_opt()?, }; - Some(Expr::Literal(LiteralValue::DateTime(ts, self.time_unit, None)).alias("datetime")) + Some( + Expr::Literal(LiteralValue::DateTime(ts, self.time_unit, None)) + .alias(PlSmallStr::from_static("datetime")), + ) } } @@ -394,7 +397,10 @@ impl DurationArgs { TimeUnit::Nanoseconds => delta.num_nanoseconds()?, }; - Some(Expr::Literal(LiteralValue::Duration(d, self.time_unit)).alias("duration")) + Some( + Expr::Literal(LiteralValue::Duration(d, self.time_unit)) + .alias(PlSmallStr::from_static("duration")), + ) } } diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 3762e0102432..11e825a7ec1f 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -313,7 +313,7 @@ impl ListNameSpace { let fields = (0..upper_bound) .map(|i| { let name = _default_struct_name_gen(i); - Field::from_owned(name, inner.clone()) + Field::new(name, inner.clone()) }) .collect(); let dt = DataType::Struct(fields); diff --git a/crates/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs index 9329f3f4fca0..0e7a30fa024b 100644 --- a/crates/polars-plan/src/dsl/meta.rs +++ b/crates/polars-plan/src/dsl/meta.rs @@ -24,7 +24,7 @@ impl MetaNameSpace { } /// Get the 
root column names. - pub fn root_names(&self) -> Vec> { + pub fn root_names(&self) -> Vec { expr_to_leaf_column_names(&self.0) } @@ -37,7 +37,7 @@ impl MetaNameSpace { } /// Get the output name of this expression. - pub fn output_name(&self) -> PolarsResult> { + pub fn output_name(&self) -> PolarsResult { expr_output_name(&self.0) } diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 166c7f5e7962..ab9a2c545db8 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -170,8 +170,11 @@ impl Expr { } /// Rename Column. - pub fn alias(self, name: &str) -> Expr { - Expr::Alias(Arc::new(self), ColumnName::from(name)) + pub fn alias(self, name: S) -> Expr + where + S: Into, + { + Expr::Alias(Arc::new(self), name.into()) } /// Run is_null operation on `Expr`. @@ -322,7 +325,7 @@ impl Expr { self.function_with_options( move |s: Series| { Ok(Some(Series::new( - s.name(), + s.name().clone(), &[s.arg_min().map(|idx| idx as u32)], ))) }, @@ -343,7 +346,7 @@ impl Expr { self.function_with_options( move |s: Series| { Ok(Some(Series::new( - s.name(), + s.name().clone(), &[s.arg_max().map(|idx| idx as IdxSize)], ))) }, @@ -826,7 +829,9 @@ impl Expr { }; self.function_with_options( - move |s: Series| Some(s.product().map(|sc| sc.into_series(s.name()))).transpose(), + move |s: Series| { + Some(s.product().map(|sc| sc.into_series(s.name().clone()))).transpose() + }, GetOutput::map_dtype(|dt| { use DataType as T; Ok(match dt { @@ -1019,7 +1024,7 @@ impl Expr { pub fn rolling(self, options: RollingGroupOptions) -> Self { // We add the index column as `partition expr` so that the optimizer will // not ignore it. - let index_col = col(options.index_column.as_str()); + let index_col = col(options.index_column.clone()); Expr::Window { function: Arc::new(self), partition_by: vec![index_col], @@ -1257,12 +1262,8 @@ impl Expr { /// Exclude a column from a wildcard/regex selection. 
/// /// You may also use regexes in the exclude as long as they start with `^` and end with `$`/ - pub fn exclude(self, columns: impl IntoVec) -> Expr { - let v = columns - .into_vec() - .into_iter() - .map(|s| Excluded::Name(ColumnName::from(s))) - .collect(); + pub fn exclude(self, columns: impl IntoVec) -> Expr { + let v = columns.into_vec().into_iter().map(Excluded::Name).collect(); Expr::Exclude(Arc::new(self), v) } @@ -1500,8 +1501,8 @@ impl Expr { GetOutput::map_field(|field| { Ok(match field.data_type() { DataType::Float64 => field.clone(), - DataType::Float32 => Field::new(field.name(), DataType::Float32), - _ => Field::new(field.name(), DataType::Float64), + DataType::Float32 => Field::new(field.name().clone(), DataType::Float32), + _ => Field::new(field.name().clone(), DataType::Float64), }) }), ) @@ -1584,13 +1585,13 @@ impl Expr { pub fn cut( self, breaks: Vec, - labels: Option>, + labels: Option>, left_closed: bool, include_breaks: bool, ) -> Expr { self.apply_private(FunctionExpr::Cut { breaks, - labels, + labels: labels.map(|x| x.into_vec()), left_closed, include_breaks, }) @@ -1605,14 +1606,14 @@ impl Expr { pub fn qcut( self, probs: Vec, - labels: Option>, + labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, ) -> Expr { self.apply_private(FunctionExpr::QCut { probs, - labels, + labels: labels.map(|x| x.into_vec()), left_closed, allow_duplicates, include_breaks, @@ -1628,7 +1629,7 @@ impl Expr { pub fn qcut_uniform( self, n_bins: usize, - labels: Option>, + labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, @@ -1636,7 +1637,7 @@ impl Expr { let probs = (1..n_bins).map(|b| b as f64 / n_bins as f64).collect(); self.apply_private(FunctionExpr::QCut { probs, - labels, + labels: labels.map(|x| x.into_vec()), left_closed, allow_duplicates, include_breaks, @@ -1797,11 +1798,11 @@ impl Expr { #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count. 
/// (Note that it is better to turn parallel off in the aggregation context). - pub fn value_counts(self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self { + pub fn value_counts(self, sort: bool, parallel: bool, name: &str, normalize: bool) -> Self { self.apply_private(FunctionExpr::ValueCounts { sort, parallel, - name, + name: name.into(), normalize, }) .with_function_options(|mut opts| { diff --git a/crates/polars-plan/src/dsl/name.rs b/crates/polars-plan/src/dsl/name.rs index ab7231b2e151..81d084783616 100644 --- a/crates/polars-plan/src/dsl/name.rs +++ b/crates/polars-plan/src/dsl/name.rs @@ -1,5 +1,6 @@ +use polars_utils::format_pl_smallstr; #[cfg(feature = "dtype-struct")] -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use super::*; @@ -27,7 +28,7 @@ impl ExprNameNameSpace { /// Define an alias by mapping a function over the original root column name. pub fn map(self, function: F) -> Expr where - F: Fn(&str) -> PolarsResult + 'static + Send + Sync, + F: Fn(&PlSmallStr) -> PolarsResult + 'static + Send + Sync, { let function = SpecialEq::new(Arc::new(function) as Arc); Expr::RenameAlias { @@ -39,25 +40,25 @@ impl ExprNameNameSpace { /// Add a prefix to the root column name. pub fn prefix(self, prefix: &str) -> Expr { let prefix = prefix.to_string(); - self.map(move |name| Ok(format!("{prefix}{name}"))) + self.map(move |name| Ok(format_pl_smallstr!("{prefix}{name}"))) } /// Add a suffix to the root column name. pub fn suffix(self, suffix: &str) -> Expr { let suffix = suffix.to_string(); - self.map(move |name| Ok(format!("{name}{suffix}"))) + self.map(move |name| Ok(format_pl_smallstr!("{name}{suffix}"))) } /// Update the root column name to use lowercase characters. 
#[allow(clippy::wrong_self_convention)] pub fn to_lowercase(self) -> Expr { - self.map(move |name| Ok(name.to_lowercase())) + self.map(move |name| Ok(PlSmallStr::from_string(name.to_lowercase()))) } /// Update the root column name to use uppercase characters. #[allow(clippy::wrong_self_convention)] pub fn to_uppercase(self) -> Expr { - self.map(move |name| Ok(name.to_uppercase())) + self.map(move |name| Ok(PlSmallStr::from_string(name.to_uppercase()))) } #[cfg(feature = "dtype-struct")] @@ -71,11 +72,11 @@ impl ExprNameNameSpace { .iter() .map(|fd| { let mut fd = fd.clone(); - fd.rename(&function(fd.name())); + fd.rename(function(fd.name())); fd }) .collect::>(); - let mut out = StructChunked::from_series(s.name(), &fields)?; + let mut out = StructChunked::from_series(s.name().clone(), &fields)?; out.zip_outer_validity(s); Ok(Some(out.into_series())) }, @@ -83,7 +84,7 @@ impl ExprNameNameSpace { DataType::Struct(fds) => { let fields = fds .iter() - .map(|fd| Field::new(&f(fd.name()), fd.data_type().clone())) + .map(|fd| Field::new(f(fd.name()), fd.data_type().clone())) .collect(); Ok(DataType::Struct(fields)) }, @@ -96,7 +97,7 @@ impl ExprNameNameSpace { pub fn prefix_fields(self, prefix: &str) -> Expr { self.0 .map_private(FunctionExpr::StructExpr(StructFunction::PrefixFields( - ColumnName::from(prefix), + PlSmallStr::from_str(prefix), ))) } @@ -104,10 +105,10 @@ impl ExprNameNameSpace { pub fn suffix_fields(self, suffix: &str) -> Expr { self.0 .map_private(FunctionExpr::StructExpr(StructFunction::SuffixFields( - ColumnName::from(suffix), + PlSmallStr::from_str(suffix), ))) } } #[cfg(feature = "dtype-struct")] -pub type FieldsNameMapper = Arc SmartString + Send + Sync>; +pub type FieldsNameMapper = Arc PlSmallStr + Send + Sync>; diff --git a/crates/polars-plan/src/dsl/options.rs b/crates/polars-plan/src/dsl/options.rs index d8181cf887c0..a4d9ae84cd73 100644 --- a/crates/polars-plan/src/dsl/options.rs +++ b/crates/polars-plan/src/dsl/options.rs @@ -1,8 +1,7 @@ -use 
std::sync::Arc; - use polars_ops::prelude::{JoinArgs, JoinType}; #[cfg(feature = "dynamic_group_by")] use polars_time::RollingGroupOptions; +use polars_utils::pl_str::PlSmallStr; use polars_utils::IdxSize; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -21,7 +20,7 @@ pub struct RollingCovOptions { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct StrptimeOptions { /// Formatting string - pub format: Option, + pub format: Option, /// If set then polars will return an error if any date parsing fails pub strict: bool, /// If polars may parse matches that not contain the whole string @@ -115,6 +114,6 @@ pub enum NestedType { pub struct UnpivotArgsDSL { pub on: Vec, pub index: Vec, - pub variable_name: Option>, - pub value_name: Option>, + pub variable_name: Option, + pub value_name: Option, } diff --git a/crates/polars-plan/src/dsl/python_udf.rs b/crates/polars-plan/src/dsl/python_udf.rs index 5fcd5e9b797a..d31a659c7be7 100644 --- a/crates/polars-plan/src/dsl/python_udf.rs +++ b/crates/polars-plan/src/dsl/python_udf.rs @@ -218,7 +218,7 @@ impl SeriesUdf for PythonUdfExpression { let output_type = self.output_type.clone(); Some(GetOutput::map_field(move |fld| { Ok(match output_type { - Some(ref dt) => Field::new(fld.name(), dt.clone()), + Some(ref dt) => Field::new(fld.name().clone(), dt.clone()), None => { let mut fld = fld.clone(); fld.coerce(DataType::Unknown(Default::default())); @@ -243,7 +243,7 @@ impl Expr { let return_dtype = func.output_type.clone(); let output_type = GetOutput::map_field(move |fld| { Ok(match return_dtype { - Some(ref dt) => Field::new(fld.name(), dt.clone()), + Some(ref dt) => Field::new(fld.name().clone(), dt.clone()), None => { let mut fld = fld.clone(); fld.coerce(DataType::Unknown(Default::default())); diff --git a/crates/polars-plan/src/dsl/selector.rs b/crates/polars-plan/src/dsl/selector.rs index c8829696639f..16e7d7b374e0 100644 --- a/crates/polars-plan/src/dsl/selector.rs +++ 
b/crates/polars-plan/src/dsl/selector.rs @@ -58,18 +58,18 @@ impl Sub for Selector { impl From<&str> for Selector { fn from(value: &str) -> Self { - Selector::new(col(value)) + Selector::new(col(PlSmallStr::from_str(value))) } } impl From for Selector { fn from(value: String) -> Self { - Selector::new(col(value.as_ref())) + Selector::new(col(PlSmallStr::from_string(value))) } } -impl From for Selector { - fn from(value: ColumnName) -> Self { +impl From for Selector { + fn from(value: PlSmallStr) -> Self { Selector::new(Expr::Column(value)) } } diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 5b62598eeb40..d392d403d1b6 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -161,6 +161,8 @@ impl StringNameSpace { pub fn extract_groups(self, pat: &str) -> PolarsResult { // regex will be compiled twice, because it doesn't support serde // and we need to compile it here to determine the output datatype + + use polars_utils::format_pl_smallstr; let reg = regex::Regex::new(pat)?; let names = reg .capture_names() @@ -168,22 +170,22 @@ impl StringNameSpace { .skip(1) .map(|(idx, opt_name)| { opt_name - .map(|name| name.to_string()) - .unwrap_or_else(|| format!("{idx}")) + .map(PlSmallStr::from_str) + .unwrap_or_else(|| format_pl_smallstr!("{idx}")) }) .collect::>(); let dtype = DataType::Struct( names .iter() - .map(|name| Field::new(name.as_str(), DataType::String)) + .map(|name| Field::new(name.clone(), DataType::String)) .collect(), ); Ok(self.0.map_private( StringFunction::ExtractGroups { dtype, - pat: pat.to_string(), + pat: pat.into(), } .into(), )) @@ -333,7 +335,7 @@ impl StringNameSpace { self.0 .apply_private( StringFunction::ConcatVertical { - delimiter: delimiter.to_owned(), + delimiter: delimiter.into(), ignore_nulls, } .into(), diff --git a/crates/polars-plan/src/dsl/struct_.rs b/crates/polars-plan/src/dsl/struct_.rs index b5a1afafa698..fe7ab700f21a 100644 --- 
a/crates/polars-plan/src/dsl/struct_.rs +++ b/crates/polars-plan/src/dsl/struct_.rs @@ -18,16 +18,15 @@ impl StructNameSpace { /// Retrieve one or multiple of the fields of this [`StructChunked`] as a new Series. /// This expression also expands the `"*"` wildcard column. - pub fn field_by_names>(self, names: &[S]) -> Expr { - self.field_by_names_impl( - names - .iter() - .map(|name| ColumnName::from(name.as_ref())) - .collect(), - ) + pub fn field_by_names(self, names: I) -> Expr + where + I: IntoIterator, + S: Into, + { + self.field_by_names_impl(names.into_iter().map(|x| x.into()).collect()) } - fn field_by_names_impl(self, names: Arc<[ColumnName]>) -> Expr { + fn field_by_names_impl(self, names: Arc<[PlSmallStr]>) -> Expr { self.0 .map_private(FunctionExpr::StructExpr(StructFunction::MultipleFields( names, @@ -42,11 +41,11 @@ impl StructNameSpace { /// This expression also supports wildcard "*" and regex expansion. pub fn field_by_name(self, name: &str) -> Expr { if name == "*" || is_regex_projection(name) { - return self.field_by_names(&[name]); + return self.field_by_names(&[name][..]); } self.0 .map_private(FunctionExpr::StructExpr(StructFunction::FieldByName( - ColumnName::from(name), + name.into(), ))) .with_function_options(|mut options| { options.flags |= FunctionFlags::ALLOW_RENAME; @@ -55,10 +54,18 @@ impl StructNameSpace { } /// Rename the fields of the [`StructChunked`]. 
- pub fn rename_fields(self, names: Vec) -> Expr { + pub fn rename_fields(self, names: I) -> Expr + where + I: IntoIterator, + S: Into, + { + self._rename_fields_impl(names.into_iter().map(|x| x.into()).collect()) + } + + pub fn _rename_fields_impl(self, names: Arc<[PlSmallStr]>) -> Expr { self.0 .map_private(FunctionExpr::StructExpr(StructFunction::RenameFields( - Arc::from(names), + names, ))) } diff --git a/crates/polars-plan/src/dsl/udf.rs b/crates/polars-plan/src/dsl/udf.rs index 35f59bc78df1..fe01cab03ea2 100644 --- a/crates/polars-plan/src/dsl/udf.rs +++ b/crates/polars-plan/src/dsl/udf.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use arrow::legacy::error::{polars_bail, PolarsResult}; use polars_core::prelude::Field; use polars_core::schema::Schema; +use polars_utils::pl_str::PlSmallStr; use super::{Expr, GetOutput, SeriesUdf, SpecialEq}; use crate::prelude::{Context, FunctionOptions}; @@ -11,7 +12,7 @@ use crate::prelude::{Context, FunctionOptions}; #[derive(Clone)] pub struct UserDefinedFunction { /// name - pub name: String, + pub name: PlSmallStr, /// The function signature. pub input_fields: Vec, /// The function output type. @@ -36,13 +37,13 @@ impl std::fmt::Debug for UserDefinedFunction { impl UserDefinedFunction { /// Create a new UserDefinedFunction pub fn new( - name: &str, + name: PlSmallStr, input_fields: Vec, return_type: GetOutput, fun: impl SeriesUdf + 'static, ) -> Self { Self { - name: name.to_owned(), + name, input_fields, return_type, fun: SpecialEq::new(Arc::new(fun)), diff --git a/crates/polars-plan/src/frame/opt_state.rs b/crates/polars-plan/src/frame/opt_state.rs index d6ed31a12882..934f42e6109f 100644 --- a/crates/polars-plan/src/frame/opt_state.rs +++ b/crates/polars-plan/src/frame/opt_state.rs @@ -18,11 +18,9 @@ bitflags! { const FILE_CACHING = 1 << 6; /// Pushdown slices/limits. const SLICE_PUSHDOWN = 1 << 7; - #[cfg(feature = "cse")] /// Run common-subplan-elimination. This elides duplicate plans and caches their /// outputs. 
const COMM_SUBPLAN_ELIM = 1 << 8; - #[cfg(feature = "cse")] /// Run common-subexpression-elimination. This elides duplicate expressions and caches their /// outputs. const COMM_SUBEXPR_ELIM = 1 << 9; diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 49e4a94a62a0..486dfa6c19e5 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -131,8 +131,8 @@ impl From for GroupByMethod { #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub enum AExpr { Explode(Node), - Alias(Node, ColumnName), - Column(ColumnName), + Alias(Node, PlSmallStr), + Column(PlSmallStr), Literal(LiteralValue), BinaryExpr { left: Node, @@ -202,8 +202,8 @@ pub enum AExpr { impl AExpr { #[cfg(feature = "cse")] - pub(crate) fn col(name: &str) -> Self { - AExpr::Column(ColumnName::from(name)) + pub(crate) fn col(name: PlSmallStr) -> Self { + AExpr::Column(name) } /// Any expression that is sensitive to the number of elements in a group /// - Aggregations diff --git a/crates/polars-plan/src/plans/aexpr/schema.rs b/crates/polars-plan/src/plans/aexpr/schema.rs index d3b24cc8cc79..89eed6b70c01 100644 --- a/crates/polars-plan/src/plans/aexpr/schema.rs +++ b/crates/polars-plan/src/plans/aexpr/schema.rs @@ -54,7 +54,7 @@ impl AExpr { match self { Len => { *nested = 0; - Ok(Field::new(LEN, IDX_DTYPE)) + Ok(Field::new(PlSmallStr::from_static(LEN), IDX_DTYPE)) }, Window { function, .. 
} => { let e = arena.get(*function); @@ -64,13 +64,13 @@ impl AExpr { let field = arena.get(*expr).to_field_impl(schema, arena, nested)?; if let List(inner) = field.data_type() { - Ok(Field::new(field.name(), *inner.clone())) + Ok(Field::new(field.name().clone(), *inner.clone())) } else { Ok(field) } }, Alias(expr, name) => Ok(Field::new( - name, + name.clone(), arena.get(*expr).to_field_impl(schema, arena, nested)?.dtype, )), Column(name) => schema @@ -80,7 +80,7 @@ impl AExpr { *nested = 0; Ok(match sv { LiteralValue::Series(s) => s.field().into_owned(), - _ => Field::new(sv.output_name(), sv.get_datatype()), + _ => Field::new(sv.output_name().clone(), sv.get_datatype()), }) }, BinaryExpr { left, right, op } => { @@ -100,9 +100,9 @@ impl AExpr { let out_field; let out_name = { out_field = arena.get(*left).to_field_impl(schema, arena, nested)?; - out_field.name().as_str() + out_field.name() }; - Field::new(out_name, Boolean) + Field::new(out_name.clone(), Boolean) }, Operator::TrueDivide => { return get_truediv_field(*left, *right, arena, schema, nested) @@ -213,7 +213,7 @@ impl AExpr { expr, data_type, .. } => { let field = arena.get(*expr).to_field_impl(schema, arena, nested)?; - Ok(Field::new(field.name(), data_type.clone())) + Ok(Field::new(field.name().clone(), data_type.clone())) }, Ternary { truthy, falsy, .. 
} => { let mut nested_truthy = *nested; @@ -287,7 +287,7 @@ fn func_args_to_fields( .get(e.node()) .to_field_impl(schema, arena, nested) .map(|mut field| { - field.name = e.output_name().into(); + field.name = e.output_name().clone(); field }) }) diff --git a/crates/polars-plan/src/plans/anonymous_scan.rs b/crates/polars-plan/src/plans/anonymous_scan.rs index d426b12f9af4..f4a641152091 100644 --- a/crates/polars-plan/src/plans/anonymous_scan.rs +++ b/crates/polars-plan/src/plans/anonymous_scan.rs @@ -8,7 +8,7 @@ use crate::dsl::Expr; pub struct AnonymousScanArgs { pub n_rows: Option, - pub with_columns: Option>, + pub with_columns: Option>, pub schema: SchemaRef, pub output_schema: Option, pub predicate: Option, diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index e2480acccd2a..60cc249ed48d 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -90,7 +90,7 @@ impl DslBuilder { use_statistics: bool, hive_options: HiveOptions, glob: bool, - include_file_paths: Option>, + include_file_paths: Option, ) -> PolarsResult { let paths = init_paths(paths); @@ -135,7 +135,7 @@ impl DslBuilder { rechunk: bool, cloud_options: Option, hive_options: HiveOptions, - include_file_paths: Option>, + include_file_paths: Option, ) -> PolarsResult { let paths = init_paths(paths); @@ -172,7 +172,7 @@ impl DslBuilder { cache: bool, cloud_options: Option, glob: bool, - include_file_paths: Option>, + include_file_paths: Option, ) -> PolarsResult { let paths = init_paths(paths); @@ -366,13 +366,10 @@ impl DslBuilder { .into() } - pub fn row_index(self, name: &str, offset: Option) -> Self { + pub fn row_index(self, name: PlSmallStr, offset: Option) -> Self { DslPlan::MapFunction { input: Arc::new(self.0), - function: DslFunction::RowIndex { - name: ColumnName::from(name), - offset, - }, + function: DslFunction::RowIndex { name, offset }, } .into() } @@ -445,7 +442,7 @@ impl DslBuilder 
{ function: F, optimizations: AllowedOptimizations, schema: Option>, - name: &str, + name: PlSmallStr, ) -> Self where F: DataFrameUdf + 'static, @@ -460,7 +457,7 @@ impl DslBuilder { predicate_pd: optimizations.contains(OptFlags::PREDICATE_PUSHDOWN), projection_pd: optimizations.contains(OptFlags::PROJECTION_PUSHDOWN), streamable: optimizations.contains(OptFlags::STREAMING), - fmt_str: name.into(), + fmt_str: name, }), } .into() diff --git a/crates/polars-plan/src/plans/builder_ir.rs b/crates/polars-plan/src/plans/builder_ir.rs index f1f3b8089e2c..91546ff0357c 100644 --- a/crates/polars-plan/src/plans/builder_ir.rs +++ b/crates/polars-plan/src/plans/builder_ir.rs @@ -68,7 +68,7 @@ impl<'a> IRBuilder<'a> { let names = nodes .into_iter() .map(|node| match self.expr_arena.get(node.into()) { - AExpr::Column(name) => name.as_ref(), + AExpr::Column(name) => name, _ => unreachable!(), }); // This is a duplication of `project_simple` because we already borrow self.expr_arena :/ @@ -81,7 +81,7 @@ impl<'a> IRBuilder<'a> { .map(|name| { let dtype = input_schema.try_get(name)?; count += 1; - Ok(Field::new(name, dtype.clone())) + Ok(Field::new(name.clone(), dtype.clone())) }) .collect::>()?; @@ -96,10 +96,11 @@ impl<'a> IRBuilder<'a> { } } - pub(crate) fn project_simple<'c, I>(self, names: I) -> PolarsResult + pub(crate) fn project_simple(self, names: I) -> PolarsResult where - I: IntoIterator, + I: IntoIterator, I::IntoIter: ExactSizeIterator, + S: Into, { let names = names.into_iter(); // if len == 0, no projection has to be done. This is a select all operation. 
@@ -110,7 +111,8 @@ impl<'a> IRBuilder<'a> { let mut count = 0; let schema = names .map(|name| { - let dtype = input_schema.try_get(name)?; + let name: PlSmallStr = name.into(); + let dtype = input_schema.try_get(name.as_str())?; count += 1; Ok(Field::new(name, dtype.clone())) }) @@ -180,10 +182,7 @@ impl<'a> IRBuilder<'a> { .to_field(&schema, Context::Default, self.expr_arena) .unwrap(); - expr_irs.push(ExprIR::new( - node, - OutputName::ColumnLhs(ColumnName::from(field.name.as_ref())), - )); + expr_irs.push(ExprIR::new(node, OutputName::ColumnLhs(field.name.clone()))); new_schema.with_column(field.name().clone(), field.data_type().clone()); } @@ -197,7 +196,7 @@ impl<'a> IRBuilder<'a> { } // call this if the schema needs to be updated - pub(crate) fn explode(self, columns: Arc<[Arc]>) -> Self { + pub(crate) fn explode(self, columns: Arc<[PlSmallStr]>) -> Self { let lp = IR::MapFunction { input: self.root, function: FunctionIR::Explode { @@ -309,7 +308,7 @@ impl<'a> IRBuilder<'a> { self.add_alp(lp) } - pub fn row_index(self, name: Arc, offset: Option) -> Self { + pub fn row_index(self, name: PlSmallStr, offset: Option) -> Self { let lp = IR::MapFunction { input: self.root, function: FunctionIR::RowIndex { diff --git a/crates/polars-plan/src/plans/conversion/convert_utils.rs b/crates/polars-plan/src/plans/conversion/convert_utils.rs index f61e3d7e286d..373f28a405bb 100644 --- a/crates/polars-plan/src/plans/conversion/convert_utils.rs +++ b/crates/polars-plan/src/plans/conversion/convert_utils.rs @@ -21,7 +21,7 @@ pub(super) fn convert_st_union( let to_cast = input_schema.iter().zip(schema.iter_dtypes()).flat_map( |((left_name, left_type), st)| { if left_type != st { - Some(col(left_name.as_ref()).cast(st.clone())) + Some(col(left_name.clone()).cast(st.clone())) } else { None } @@ -84,7 +84,7 @@ pub(super) fn convert_diagonal_concat( for (name, dtype) in total_schema.iter() { // If a name from Total Schema is not present - append if 
lf_schema.get_field(name).is_none() { - columns_to_add.push(NULL.lit().cast(dtype.clone()).alias(name)) + columns_to_add.push(NULL.lit().cast(dtype.clone()).alias(name.clone())) } } let expr = to_expr_irs(columns_to_add, expr_arena)?; diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index b75b4d67b55b..2d8ee40614ee 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -20,9 +20,10 @@ fn expand_expressions( exprs: Vec, lp_arena: &Arena, expr_arena: &mut Arena, + opt_flags: &mut OptFlags, ) -> PolarsResult> { let schema = lp_arena.get(input).schema(lp_arena); - let exprs = rewrite_projections(exprs, &schema, &[])?; + let exprs = rewrite_projections(exprs, &schema, &[], opt_flags)?; to_expr_irs(exprs, expr_arena) } @@ -57,17 +58,18 @@ pub fn to_alp( expr_arena: &mut Arena, lp_arena: &mut Arena, // Only `SIMPLIFY_EXPR` and `TYPE_COERCION` are respected. 
- opt_state: &mut OptFlags, + opt_flags: &mut OptFlags, ) -> PolarsResult { let conversion_optimizer = ConversionOptimizer::new( - opt_state.contains(OptFlags::SIMPLIFY_EXPR), - opt_state.contains(OptFlags::TYPE_COERCION), + opt_flags.contains(OptFlags::SIMPLIFY_EXPR), + opt_flags.contains(OptFlags::TYPE_COERCION), ); let mut ctxt = ConversionContext { expr_arena, lp_arena, conversion_optimizer, + opt_flags, }; to_alp_impl(lp, &mut ctxt) @@ -77,6 +79,7 @@ struct ConversionContext<'a> { expr_arena: &'a mut Arena, lp_arena: &'a mut Arena, conversion_optimizer: ConversionOptimizer, + opt_flags: &'a mut OptFlags, } /// converts LogicalPlan to IR @@ -305,7 +308,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult { let mut input = to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(filter)))?; - let predicate = expand_filter(predicate, input, ctxt.lp_arena) + let predicate = expand_filter(predicate, input, ctxt.lp_arena, ctxt.opt_flags) .map_err(|e| e.context(failed_here!(filter)))?; let predicate_ae = to_expr_ir(predicate.clone(), ctxt.expr_arena)?; @@ -378,8 +381,8 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult PolarsResult PolarsResult PolarsResult PolarsResult { - Some(col(name).fill_nan(fill_value.clone()).alias(name)) - }, + DataType::Float32 | DataType::Float64 => Some( + col(name.clone()) + .fill_nan(fill_value.clone()) + .alias(name.clone()), + ), _ => None, }) .collect::>(); - let (exprs, schema) = - resolve_with_columns(exprs, input, ctxt.lp_arena, ctxt.expr_arena) - .map_err(|e| e.context(failed_here!(fill_nan)))?; + let (exprs, schema) = resolve_with_columns( + exprs, + input, + ctxt.lp_arena, + ctxt.expr_arena, + ctxt.opt_flags, + ) + .map_err(|e| e.context(failed_here!(fill_nan)))?; ctxt.conversion_optimizer .fill_scratch(&exprs, ctxt.expr_arena); @@ -733,22 +756,22 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult stats_helper( |dt| 
dt.is_numeric() || dt.is_bool(), - |name| col(name).var(ddof), + |name| col(name.clone()).var(ddof), &input_schema, ), StatsFunction::Std { ddof } => stats_helper( |dt| dt.is_numeric() || dt.is_bool(), - |name| col(name).std(ddof), + |name| col(name.clone()).std(ddof), &input_schema, ), StatsFunction::Quantile { quantile, interpol } => stats_helper( |dt| dt.is_numeric(), - |name| col(name).quantile(quantile.clone(), interpol), + |name| col(name.clone()).quantile(quantile.clone(), interpol), &input_schema, ), StatsFunction::Mean => stats_helper( |dt| dt.is_numeric() || dt.is_temporal() || dt == &DataType::Boolean, - |name| col(name).mean(), + |name| col(name.clone()).mean(), &input_schema, ), StatsFunction::Sum => stats_helper( @@ -757,18 +780,22 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult stats_helper( + |dt| dt.is_ord(), + |name| col(name.clone()).min(), + &input_schema, + ), + StatsFunction::Max => stats_helper( + |dt| dt.is_ord(), + |name| col(name.clone()).max(), &input_schema, ), - StatsFunction::Min => { - stats_helper(|dt| dt.is_ord(), |name| col(name).min(), &input_schema) - }, - StatsFunction::Max => { - stats_helper(|dt| dt.is_ord(), |name| col(name).max(), &input_schema) - }, StatsFunction::Median => stats_helper( |dt| dt.is_numeric() || dt.is_temporal() || dt == &DataType::Boolean, - |name| col(name).median(), + |name| col(name.clone()).median(), &input_schema, ), }; @@ -911,7 +938,12 @@ fn expand_scan_paths_with_hive_update( Ok(expanded_paths) } -fn expand_filter(predicate: Expr, input: Node, lp_arena: &Arena) -> PolarsResult { +fn expand_filter( + predicate: Expr, + input: Node, + lp_arena: &Arena, + opt_flags: &mut OptFlags, +) -> PolarsResult { let schema = lp_arena.get(input).schema(lp_arena); let predicate = if has_expr(&predicate, |e| match e { Expr::Column(name) => is_regex_projection(name), @@ -924,7 +956,7 @@ fn expand_filter(predicate: Expr, input: Node, lp_arena: &Arena) -> PolarsRe | Expr::Nth(_) => 
true, _ => false, }) { - let mut rewritten = rewrite_projections(vec![predicate], &schema, &[])?; + let mut rewritten = rewrite_projections(vec![predicate], &schema, &[], opt_flags)?; match rewritten.len() { 1 => { // all good @@ -971,10 +1003,11 @@ fn resolve_with_columns( input: Node, lp_arena: &Arena, expr_arena: &mut Arena, + opt_flags: &mut OptFlags, ) -> PolarsResult<(Vec, SchemaRef)> { let schema = lp_arena.get(input).schema(lp_arena); let mut new_schema = (**schema).clone(); - let (exprs, _) = prepare_projection(exprs, &schema)?; + let (exprs, _) = prepare_projection(exprs, &schema, opt_flags)?; let mut output_names = PlHashSet::with_capacity(exprs.len()); let mut arena = Arena::with_capacity(8); @@ -1008,10 +1041,11 @@ fn resolve_group_by( _options: &GroupbyOptions, lp_arena: &Arena, expr_arena: &mut Arena, + opt_flags: &mut OptFlags, ) -> PolarsResult<(Vec, Vec, SchemaRef)> { let current_schema = lp_arena.get(input).schema(lp_arena); let current_schema = current_schema.as_ref(); - let mut keys = rewrite_projections(keys, current_schema, &[])?; + let mut keys = rewrite_projections(keys, current_schema, &[], opt_flags)?; // Initialize schema from keys let mut schema = expressions_to_schema(&keys, current_schema, Context::Default)?; @@ -1023,16 +1057,16 @@ fn resolve_group_by( #[cfg(feature = "dynamic_group_by")] { if let Some(options) = _options.rolling.as_ref() { - let name = &options.index_column; - let dtype = current_schema.try_get(name)?; - keys.push(col(name)); + let name = options.index_column.clone(); + let dtype = current_schema.try_get(name.as_str())?; + keys.push(col(name.clone())); pop_keys = true; schema.with_column(name.clone(), dtype.clone()); } else if let Some(options) = _options.dynamic.as_ref() { - let name = &options.index_column; - keys.push(col(name)); + let name = options.index_column.clone(); + keys.push(col(name.clone())); pop_keys = true; - let dtype = current_schema.try_get(name)?; + let dtype = 
current_schema.try_get(name.as_str())?; if options.include_boundaries { schema.with_column("_lower_boundary".into(), dtype.clone()); schema.with_column("_upper_boundary".into(), dtype.clone()); @@ -1042,7 +1076,7 @@ fn resolve_group_by( } let keys_index_len = schema.len(); - let aggs = rewrite_projections(aggs, current_schema, &keys)?; + let aggs = rewrite_projections(aggs, current_schema, &keys, opt_flags)?; if pop_keys { let _ = keys.pop(); } @@ -1067,7 +1101,7 @@ fn resolve_group_by( fn stats_helper(condition: F, expr: E, schema: &Schema) -> Vec where F: Fn(&DataType) -> bool, - E: Fn(&str) -> Expr, + E: Fn(&PlSmallStr) -> Expr, { schema .iter() @@ -1075,7 +1109,7 @@ where if condition(dt) { expr(name) } else { - lit(NULL).cast(dt.clone()).alias(name) + lit(NULL).cast(dt.clone()).alias(name.clone()) } }) .collect() @@ -1084,7 +1118,7 @@ where pub(crate) fn maybe_init_projection_excluding_hive( reader_schema: &Either, hive_parts: Option<&HivePartitions>, -) -> Option> { +) -> Option> { // Update `with_columns` with a projection so that hive columns aren't loaded from the // file let hive_parts = hive_parts?; @@ -1095,19 +1129,20 @@ pub(crate) fn maybe_init_projection_excluding_hive( let names = match reader_schema { Either::Left(ref v) => { - let names = v.get_names(); - names.contains(&first_hive_name.as_str()).then_some(names) + let names = v.get_names_owned(); + names.contains(first_hive_name).then_some(names) }, - Either::Right(ref v) => v.contains(first_hive_name.as_str()).then(|| v.get_names()), + Either::Right(ref v) => v + .contains(first_hive_name.as_str()) + .then(|| v.get_names_owned()), }; let names = names?; Some( names - .iter() + .into_iter() .filter(|x| !hive_schema.contains(x)) - .map(ToString::to_string) .collect::>(), ) } diff --git a/crates/polars-plan/src/plans/conversion/expr_expansion.rs b/crates/polars-plan/src/plans/conversion/expr_expansion.rs index bcb6f957a7b2..46eed6758f44 100644 --- 
a/crates/polars-plan/src/plans/conversion/expr_expansion.rs +++ b/crates/polars-plan/src/plans/conversion/expr_expansion.rs @@ -6,15 +6,16 @@ use super::*; pub(crate) fn prepare_projection( exprs: Vec, schema: &Schema, + opt_flags: &mut OptFlags, ) -> PolarsResult<(Vec, Schema)> { - let exprs = rewrite_projections(exprs, schema, &[])?; + let exprs = rewrite_projections(exprs, schema, &[], opt_flags)?; let schema = expressions_to_schema(&exprs, schema, Context::Default)?; Ok((exprs, schema)) } /// This replaces the wildcard Expr with a Column Expr. It also removes the Exclude Expr from the /// expression chain. -pub(super) fn replace_wildcard_with_column(expr: Expr, column_name: Arc) -> Expr { +pub(super) fn replace_wildcard_with_column(expr: Expr, column_name: &PlSmallStr) -> Expr { expr.map_expr(|e| match e { Expr::Wildcard => Expr::Column(column_name.clone()), Expr::Exclude(input, _) => Arc::unwrap_or_clone(input), @@ -46,7 +47,7 @@ fn rewrite_special_aliases(expr: Expr) -> PolarsResult { Expr::RenameAlias { expr, function } => { let name = get_single_leaf(&expr).unwrap(); let name = function.call(&name)?; - Ok(Expr::Alias(expr, ColumnName::from(name))) + Ok(Expr::Alias(expr, name)) }, _ => { polars_bail!(InvalidOperation: "`keep`, `suffix`, `prefix` should be last expression") @@ -63,13 +64,12 @@ fn rewrite_special_aliases(expr: Expr) -> PolarsResult { fn replace_wildcard( expr: &Expr, result: &mut Vec, - exclude: &PlHashSet>, + exclude: &PlHashSet, schema: &Schema, ) -> PolarsResult<()> { for name in schema.iter_names() { if !exclude.contains(name.as_str()) { - let new_expr = - replace_wildcard_with_column(expr.clone(), ColumnName::from(name.as_str())); + let new_expr = replace_wildcard_with_column(expr.clone(), name); let new_expr = rewrite_special_aliases(new_expr)?; result.push(new_expr) } @@ -87,11 +87,11 @@ fn replace_nth(expr: Expr, schema: &Schema) -> Expr { -1 => "last", _ => "nth", }; - Expr::Column(ColumnName::from(name)) + 
Expr::Column(PlSmallStr::from_static(name)) }, Some(idx) => { let (name, _dtype) = schema.get_at_index(idx).unwrap(); - Expr::Column(ColumnName::from(&**name)) + Expr::Column(name.clone()) }, } } else { @@ -108,7 +108,7 @@ fn expand_regex( result: &mut Vec, schema: &Schema, pattern: &str, - exclude: &PlHashSet>, + exclude: &PlHashSet, ) -> PolarsResult<()> { let re = regex::Regex::new(pattern).map_err(|e| polars_err!(ComputeError: "invalid regex {}", e))?; @@ -117,9 +117,7 @@ fn expand_regex( let mut new_expr = remove_exclude(expr.clone()); new_expr = new_expr.map_expr(|e| match e { - Expr::Column(pat) if pat.as_ref() == pattern => { - Expr::Column(ColumnName::from(name.as_str())) - }, + Expr::Column(pat) if pat.as_ref() == pattern => Expr::Column(name.clone()), e => e, }); @@ -141,7 +139,7 @@ fn replace_regex( expr: &Expr, result: &mut Vec, schema: &Schema, - exclude: &PlHashSet>, + exclude: &PlHashSet, ) -> PolarsResult<()> { let roots = expr_to_leaf_column_names(expr); let mut regex = None; @@ -174,9 +172,9 @@ fn replace_regex( fn expand_columns( expr: &Expr, result: &mut Vec, - names: &[ColumnName], + names: &[PlSmallStr], schema: &Schema, - exclude: &PlHashSet, + exclude: &PlHashSet, ) -> PolarsResult<()> { let mut is_valid = true; for name in names { @@ -215,12 +213,10 @@ fn struct_index_to_field(expr: Expr, schema: &Schema) -> PolarsResult { polars_bail!(InvalidOperation: "expected 'struct' dtype, got {:?}", dtype) }; let index = index.try_negative_to_usize(fields.len())?; - let name = fields[index].name.as_str(); + let name = fields[index].name.clone(); Ok(Expr::Function { input, - function: FunctionExpr::StructExpr(StructFunction::FieldByName( - ColumnName::from(name), - )), + function: FunctionExpr::StructExpr(StructFunction::FieldByName(name)), options, }) } else { @@ -239,7 +235,7 @@ fn struct_index_to_field(expr: Expr, schema: &Schema) -> PolarsResult { /// ()It also removes the Exclude Expr from the expression chain). 
fn replace_dtype_or_index_with_column( expr: Expr, - column_name: &ColumnName, + column_name: &PlSmallStr, replace_dtype: bool, ) -> Expr { expr.map_expr(|e| match e { @@ -254,8 +250,8 @@ fn replace_dtype_or_index_with_column( /// expression chain. pub(super) fn replace_columns_with_column( mut expr: Expr, - names: &[ColumnName], - column_name: &ColumnName, + names: &[PlSmallStr], + column_name: &PlSmallStr, ) -> (Expr, bool) { let mut is_valid = true; expr = expr.map_expr(|e| match e { @@ -294,7 +290,7 @@ fn expand_dtypes( result: &mut Vec, schema: &Schema, dtypes: &[DataType], - exclude: &PlHashSet>, + exclude: &PlHashSet, ) -> PolarsResult<()> { // note: we loop over the schema to guarantee that we return a stable // field-order, irrespective of which dtypes are filtered against @@ -304,8 +300,7 @@ fn expand_dtypes( }) { let name = field.name(); let new_expr = expr.clone(); - let new_expr = - replace_dtype_or_index_with_column(new_expr, &ColumnName::from(name.as_str()), true); + let new_expr = replace_dtype_or_index_with_column(new_expr, name, true); let new_expr = rewrite_special_aliases(new_expr)?; result.push(new_expr) } @@ -315,7 +310,7 @@ fn expand_dtypes( #[cfg(feature = "dtype-struct")] fn replace_struct_multiple_fields_with_field( expr: Expr, - column_name: &ColumnName, + column_name: &PlSmallStr, ) -> PolarsResult { let mut count = 0; let out = expr.map_expr(|e| match e { @@ -356,8 +351,8 @@ fn expand_struct_fields( full_expr: &Expr, result: &mut Vec, schema: &Schema, - names: &[ColumnName], - exclude: &PlHashSet>, + names: &[PlSmallStr], + exclude: &PlHashSet, ) -> PolarsResult<()> { let first_name = names[0].as_ref(); if names.len() == 1 && first_name == "*" || is_regex_projection(first_name) { @@ -374,12 +369,12 @@ fn expand_struct_fields( fields .iter() .flat_map(|field| { - let name = field.name().as_str(); + let name = field.name(); - if exclude.contains(name) { + if exclude.contains(name.as_str()) { None } else { - 
Some(Arc::from(field.name().as_str())) + Some(name.clone()) } }) .collect::>() @@ -394,11 +389,11 @@ fn expand_struct_fields( fields .iter() .flat_map(|field| { - let name = field.name().as_str(); - if exclude.contains(name) || !re.is_match(name) { + let name = field.name(); + if exclude.contains(name.as_str()) || !re.is_match(name.as_str()) { None } else { - Some(Arc::from(field.name().as_str())) + Some(name.clone()) } }) .collect::>() @@ -409,7 +404,14 @@ fn expand_struct_fields( } }; - return expand_struct_fields(struct_expr, full_expr, result, schema, &names, exclude); + return expand_struct_fields( + struct_expr, + full_expr, + result, + schema, + names.as_slice(), + exclude, + ); } for name in names { @@ -423,7 +425,7 @@ fn expand_struct_fields( }, Expr::RenameAlias { expr, function } => { let name = function.call(name)?; - new_expr = Expr::Alias(expr, ColumnName::from(name)); + new_expr = Expr::Alias(expr, name); }, _ => {}, } @@ -440,7 +442,7 @@ fn expand_indices( result: &mut Vec, schema: &Schema, indices: &[i64], - exclude: &PlHashSet>, + exclude: &PlHashSet, ) -> PolarsResult<()> { let n_fields = schema.len() as i64; for idx in indices { @@ -454,11 +456,7 @@ fn expand_indices( if let Some((name, _)) = schema.get_at_index(idx as usize) { if !exclude.contains(name.as_str()) { let new_expr = expr.clone(); - let new_expr = replace_dtype_or_index_with_column( - new_expr, - &ColumnName::from(name.as_str()), - false, - ); + let new_expr = replace_dtype_or_index_with_column(new_expr, name, false); let new_expr = rewrite_special_aliases(new_expr)?; result.push(new_expr); } @@ -474,7 +472,7 @@ fn prepare_excluded( schema: &Schema, keys: &[Expr], has_exclude: bool, -) -> PolarsResult>> { +) -> PolarsResult> { let mut exclude = PlHashSet::new(); // explicit exclude branch @@ -502,7 +500,7 @@ fn prepare_excluded( Excluded::Dtype(dt) => { for fld in schema.iter_fields() { if dtypes_match(fld.data_type(), dt) { - exclude.insert(ColumnName::from(fld.name().as_ref())); + 
exclude.insert(fld.name.clone()); } } }, @@ -520,7 +518,7 @@ fn prepare_excluded( Excluded::Dtype(dt) => { for (name, dtype) in schema.iter() { if matches!(dtype, dt) { - exclude.insert(ColumnName::from(name.as_str())); + exclude.insert(name.clone()); } } }, @@ -541,14 +539,18 @@ fn prepare_excluded( } // functions can have col(["a", "b"]) or col(String) as inputs -fn expand_function_inputs(expr: Expr, schema: &Schema) -> PolarsResult { +fn expand_function_inputs( + expr: Expr, + schema: &Schema, + opt_flags: &mut OptFlags, +) -> PolarsResult { expr.try_map_expr(|mut e| match &mut e { Expr::AnonymousFunction { input, options, .. } | Expr::Function { input, options, .. } if options .flags .contains(FunctionFlags::INPUT_WILDCARD_EXPANSION) => { - *input = rewrite_projections(core::mem::take(input), schema, &[]).unwrap(); + *input = rewrite_projections(core::mem::take(input), schema, &[], opt_flags).unwrap(); if input.is_empty() && !options.flags.contains(FunctionFlags::ALLOW_EMPTY_INPUTS) { // Needed to visualize the error *input = vec![Expr::Literal(LiteralValue::Null)]; @@ -639,12 +641,27 @@ fn find_flags(expr: &Expr) -> PolarsResult { }) } +#[cfg(feature = "dtype-struct")] +fn toggle_cse(opt_flags: &mut OptFlags) { + if opt_flags.contains(OptFlags::EAGER) { + #[cfg(debug_assertions)] + { + use polars_core::config::verbose; + if verbose() { + eprintln!("CSE turned on because of struct expansion") + } + } + *opt_flags |= OptFlags::COMM_SUBEXPR_ELIM; + } +} + /// In case of single col(*) -> do nothing, no selection is the same as select all /// In other cases replace the wildcard with an expression with all columns pub(crate) fn rewrite_projections( exprs: Vec, schema: &Schema, keys: &[Expr], + opt_flags: &mut OptFlags, ) -> PolarsResult> { let mut result = Vec::with_capacity(exprs.len() + schema.len()); @@ -653,7 +670,7 @@ pub(crate) fn rewrite_projections( let result_offset = result.len(); // Functions can have col(["a", "b"]) or col(String) as inputs. 
- expr = expand_function_inputs(expr, schema)?; + expr = expand_function_inputs(expr, schema, opt_flags)?; let mut flags = find_flags(&expr)?; if flags.has_selector { @@ -662,10 +679,11 @@ pub(crate) fn rewrite_projections( flags.multiple_columns = true; } - replace_and_add_to_results(expr, flags, &mut result, schema, keys)?; + replace_and_add_to_results(expr, flags, &mut result, schema, keys, opt_flags)?; #[cfg(feature = "dtype-struct")] if flags.has_struct_field_by_index { + toggle_cse(opt_flags); for e in &mut result[result_offset..] { *e = struct_index_to_field(std::mem::take(e), schema)?; } @@ -680,6 +698,7 @@ fn replace_and_add_to_results( result: &mut Vec, schema: &Schema, keys: &[Expr], + opt_flags: &mut OptFlags, ) -> PolarsResult<()> { if flags.has_nth { expr = replace_nth(expr, schema); @@ -732,6 +751,7 @@ fn replace_and_add_to_results( &mut intermediate, schema, keys, + opt_flags, )?; // Then expand the fields and add to the final result vec. @@ -739,12 +759,13 @@ fn replace_and_add_to_results( flags.multiple_columns = false; flags.has_wildcard = false; for e in intermediate { - replace_and_add_to_results(e, flags, result, schema, keys)?; + replace_and_add_to_results(e, flags, result, schema, keys, opt_flags)?; } } // has only field expansion // col('a').struct.field('*') else { + toggle_cse(opt_flags); expand_struct_fields(e, &expr, result, schema, names, &exclude)? } }, @@ -787,7 +808,14 @@ fn replace_selector_inner( match s { Selector::Root(expr) => { let local_flags = find_flags(&expr)?; - replace_and_add_to_results(*expr, local_flags, scratch, schema, keys)?; + replace_and_add_to_results( + *expr, + local_flags, + scratch, + schema, + keys, + &mut Default::default(), + )?; members.extend(scratch.drain(..)) }, Selector::Add(lhs, rhs) => { @@ -851,7 +879,7 @@ pub(crate) fn expand_selectors( s: Vec, schema: &Schema, keys: &[Expr], -) -> PolarsResult> { +) -> PolarsResult> { let mut columns = vec![]; // Skip the column fast paths. 
@@ -889,7 +917,7 @@ pub(super) fn expand_selector( s: Selector, schema: &Schema, keys: &[Expr], -) -> PolarsResult> { +) -> PolarsResult> { let mut members = PlIndexSet::new(); replace_selector_inner(s, &mut members, &mut vec![], schema, keys)?; @@ -907,7 +935,7 @@ pub(super) fn expand_selector( // Ensure that multiple columns returned from combined/nested selectors remain in schema order let selected = schema .iter_fields() - .map(|field| ColumnName::from(field.name().as_ref())) + .map(|field| field.name().clone()) .filter(|field_name| members.contains(&Expr::Column(field_name.clone()))) .collect(); diff --git a/crates/polars-plan/src/plans/conversion/expr_to_ir.rs b/crates/polars-plan/src/plans/conversion/expr_to_ir.rs index ff5f78075610..bcfacb7f0dc6 100644 --- a/crates/polars-plan/src/plans/conversion/expr_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/expr_to_ir.rs @@ -74,7 +74,8 @@ where { if state.output_name.is_none() { if e.is_empty() { - state.output_name = OutputName::LiteralLhs(ColumnName::from(function_fmt().as_ref())); + let s = function_fmt(); + state.output_name = OutputName::LiteralLhs(PlSmallStr::from_str(s.as_ref())); } else { state.output_name = e[0].output_name_inner().clone(); } @@ -137,7 +138,7 @@ fn to_aexpr_impl( }, Expr::Literal(lv) => { if state.output_name.is_none() { - state.output_name = OutputName::LiteralLhs(lv.output_column_name()); + state.output_name = OutputName::LiteralLhs(lv.output_column_name().clone()); } AExpr::Literal(lv) }, diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 8d7e232c4cd7..28c41039d4b8 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -141,7 +141,7 @@ impl IR { let input = convert_to_lp(input, lp_arena); let expr = columns .iter_names() - .map(|name| Expr::Column(ColumnName::from(name.as_str()))) + .map(|name| Expr::Column(name.clone())) .collect::>(); DslPlan::Select { 
expr, diff --git a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs index f983cf20b76e..230652e101f7 100644 --- a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs @@ -622,7 +622,7 @@ mod test { let rules: &mut [Box] = &mut [Box::new(TypeCoercionRule {})]; let df = DataFrame::new(Vec::from([Series::new_empty( - "fruits", + PlSmallStr::from_static("fruits"), &DataType::Categorical(None, Default::default()), )])) .unwrap(); diff --git a/crates/polars-plan/src/plans/expr_ir.rs b/crates/polars-plan/src/plans/expr_ir.rs index d9c0886c201c..8512fdc8d8ea 100644 --- a/crates/polars-plan/src/plans/expr_ir.rs +++ b/crates/polars-plan/src/plans/expr_ir.rs @@ -3,11 +3,12 @@ use std::hash::Hash; #[cfg(feature = "cse")] use std::hash::Hasher; +use polars_utils::format_pl_smallstr; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; use super::*; -use crate::constants::{get_len_name, LITERAL_NAME}; +use crate::constants::{get_len_name, get_literal_name}; #[derive(Default, Debug, Clone, Hash, PartialEq, Eq)] #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] @@ -16,18 +17,18 @@ pub enum OutputName { #[default] None, /// The most left-hand-side literal will be the output name. - LiteralLhs(ColumnName), + LiteralLhs(PlSmallStr), /// The most left-hand-side column will be the output name. - ColumnLhs(ColumnName), - /// Rename the output as `ColumnName`. - Alias(ColumnName), + ColumnLhs(PlSmallStr), + /// Rename the output as `PlSmallStr`. + Alias(PlSmallStr), #[cfg(feature = "dtype-struct")] /// A struct field. 
- Field(ColumnName), + Field(PlSmallStr), } impl OutputName { - pub fn unwrap(&self) -> &ColumnName { + pub fn unwrap(&self) -> &PlSmallStr { match self { OutputName::Alias(name) => name, OutputName::ColumnLhs(name) => name, @@ -79,9 +80,9 @@ impl ExprIR { }, AExpr::Literal(lv) => { if let LiteralValue::Series(s) = lv { - out.output_name = OutputName::LiteralLhs(s.name().into()); + out.output_name = OutputName::LiteralLhs(s.name().clone()); } else { - out.output_name = OutputName::LiteralLhs(LITERAL_NAME.into()); + out.output_name = OutputName::LiteralLhs(get_literal_name().clone()); } break; }, @@ -95,9 +96,8 @@ impl ExprIR { }, _ => { if input.is_empty() { - out.output_name = OutputName::LiteralLhs(ColumnName::from( - format!("{}", function), - )); + out.output_name = + OutputName::LiteralLhs(format_pl_smallstr!("{}", function)); } else { out.output_name = input[0].output_name.clone(); } @@ -107,7 +107,8 @@ impl ExprIR { }, AExpr::AnonymousFunction { input, options, .. } => { if input.is_empty() { - out.output_name = OutputName::LiteralLhs(ColumnName::from(options.fmt_str)); + out.output_name = + OutputName::LiteralLhs(PlSmallStr::from_static(options.fmt_str)); } else { out.output_name = input[0].output_name.clone(); } @@ -147,7 +148,7 @@ impl ExprIR { } #[cfg(feature = "cse")] - pub(crate) fn set_alias(&mut self, name: ColumnName) { + pub(crate) fn set_alias(&mut self, name: PlSmallStr) { self.output_name = OutputName::Alias(name) } @@ -155,24 +156,20 @@ impl ExprIR { &self.output_name } - pub(crate) fn output_name_arc(&self) -> &Arc { + pub fn output_name(&self) -> &PlSmallStr { self.output_name.unwrap() } - pub fn output_name(&self) -> &str { - self.output_name_arc().as_ref() - } - pub fn to_expr(&self, expr_arena: &Arena) -> Expr { let out = node_to_expr(self.node, expr_arena); match &self.output_name { - OutputName::Alias(name) => out.alias(name.as_ref()), + OutputName::Alias(name) => out.alias(name.clone()), _ => out, } } - pub fn get_alias(&self) -> 
Option<&ColumnName> { + pub fn get_alias(&self) -> Option<&PlSmallStr> { match &self.output_name { OutputName::Alias(name) => Some(name), _ => None, @@ -180,7 +177,7 @@ impl ExprIR { } /// Gets any name except one deriving from `Column`. - pub(crate) fn get_non_projected_name(&self) -> Option<&ColumnName> { + pub(crate) fn get_non_projected_name(&self) -> Option<&PlSmallStr> { match &self.output_name { OutputName::Alias(name) => Some(name), #[cfg(feature = "dtype-struct")] @@ -232,20 +229,20 @@ impl From<&ExprIR> for Node { } } -pub(crate) fn name_to_expr_ir(name: &str, expr_arena: &mut Arena) -> ExprIR { - let name = ColumnName::from(name); +pub(crate) fn name_to_expr_ir(name: PlSmallStr, expr_arena: &mut Arena) -> ExprIR { let node = expr_arena.add(AExpr::Column(name.clone())); ExprIR::new(node, OutputName::ColumnLhs(name)) } -pub(crate) fn names_to_expr_irs, S: AsRef>( - names: I, - expr_arena: &mut Arena, -) -> Vec { +pub(crate) fn names_to_expr_irs(names: I, expr_arena: &mut Arena) -> Vec +where + I: IntoIterator, + S: Into, +{ names .into_iter() .map(|name| { - let name = name.as_ref(); + let name = name.into(); name_to_expr_ir(name, expr_arena) }) .collect() diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index d00f19e36f8a..bd68db61a06c 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -86,7 +86,10 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu let count: IdxSize = count.try_into().map_err( |_| polars_err!(ComputeError: "count of {} exceeded maximum row size", count), )?; - DataFrame::new(vec![Series::new(crate::constants::LEN, [count])]) + DataFrame::new(vec![Series::new( + PlSmallStr::from_static(crate::constants::LEN), + [count], + )]) } } #[cfg(feature = "parquet")] diff --git a/crates/polars-plan/src/plans/functions/dsl.rs b/crates/polars-plan/src/plans/functions/dsl.rs index 
458c7c6d8e28..7b7e43e797eb 100644 --- a/crates/polars-plan/src/plans/functions/dsl.rs +++ b/crates/polars-plan/src/plans/functions/dsl.rs @@ -36,12 +36,12 @@ pub enum DslFunction { args: UnpivotArgsDSL, }, RowIndex { - name: Arc, + name: PlSmallStr, offset: Option, }, Rename { - existing: Arc<[SmartString]>, - new: Arc<[SmartString]>, + existing: Arc<[PlSmallStr]>, + new: Arc<[PlSmallStr]>, }, Unnest(Vec), Stats(StatsFunction), diff --git a/crates/polars-plan/src/plans/functions/explode.rs b/crates/polars-plan/src/plans/functions/explode.rs index 0103ed5f2818..a5140d81103b 100644 --- a/crates/polars-plan/src/plans/functions/explode.rs +++ b/crates/polars-plan/src/plans/functions/explode.rs @@ -1,5 +1,5 @@ use super::*; -pub(super) fn explode_impl(df: DataFrame, columns: &[SmartString]) -> PolarsResult { +pub(super) fn explode_impl(df: DataFrame, columns: &[PlSmallStr]) -> PolarsResult { df.explode(columns) } diff --git a/crates/polars-plan/src/plans/functions/merge_sorted.rs b/crates/polars-plan/src/plans/functions/merge_sorted.rs index a20a85d68812..ffc9e1f04df6 100644 --- a/crates/polars-plan/src/plans/functions/merge_sorted.rs +++ b/crates/polars-plan/src/plans/functions/merge_sorted.rs @@ -10,7 +10,7 @@ pub(super) fn merge_sorted(df: &DataFrame, column: &str) -> PolarsResult PolarsResult>, scan_type: FileScan, - alias: Option>, + alias: Option, }, /// Streaming engine pipeline #[cfg_attr(feature = "ir_serde", serde(skip))] @@ -57,7 +57,7 @@ pub enum FunctionIR { original: Option>, }, Unnest { - columns: Arc<[ColumnName]>, + columns: Arc<[PlSmallStr]>, }, Rechunk, // The two DataFrames are temporary concatenated @@ -67,18 +67,18 @@ pub enum FunctionIR { #[cfg(feature = "merge_sorted")] MergeSorted { // sorted column that serves as the key - column: Arc, + column: PlSmallStr, }, Rename { - existing: Arc<[SmartString]>, - new: Arc<[SmartString]>, + existing: Arc<[PlSmallStr]>, + new: Arc<[PlSmallStr]>, // A column name gets swapped with an existing column 
swapping: bool, #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, Explode { - columns: Arc<[ColumnName]>, + columns: Arc<[PlSmallStr]>, #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, @@ -89,7 +89,7 @@ pub enum FunctionIR { schema: CachedSchema, }, RowIndex { - name: Arc, + name: PlSmallStr, // Might be cached. #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, @@ -238,7 +238,7 @@ impl FunctionIR { } } - pub(crate) fn additional_projection_pd_columns(&self) -> Cow<[Arc]> { + pub(crate) fn additional_projection_pd_columns(&self) -> Cow<[PlSmallStr]> { use FunctionIR::*; match self { Unnest { columns } => Cow::Borrowed(columns.as_ref()), @@ -272,7 +272,7 @@ impl FunctionIR { Unnest { columns: _columns } => { #[cfg(feature = "dtype-struct")] { - df.unnest(_columns.as_ref()) + df.unnest(_columns.iter().cloned()) } #[cfg(not(feature = "dtype-struct"))] { @@ -293,14 +293,14 @@ impl FunctionIR { } }, Rename { existing, new, .. } => rename::rename_impl(df, existing, new), - Explode { columns, .. } => df.explode(columns.as_ref()), + Explode { columns, .. } => df.explode(columns.iter().cloned()), #[cfg(feature = "pivot")] Unpivot { args, .. } => { use polars_ops::pivot::UnpivotDF; let args = (**args).clone(); df.unpivot2(args) }, - RowIndex { name, offset, .. } => df.with_row_index(name.as_ref(), *offset), + RowIndex { name, offset, .. 
} => df.with_row_index(name.clone(), *offset), } } diff --git a/crates/polars-plan/src/plans/functions/rename.rs b/crates/polars-plan/src/plans/functions/rename.rs index fea6c2cc635c..7a58101e3731 100644 --- a/crates/polars-plan/src/plans/functions/rename.rs +++ b/crates/polars-plan/src/plans/functions/rename.rs @@ -2,8 +2,8 @@ use super::*; pub(super) fn rename_impl( mut df: DataFrame, - existing: &[SmartString], - new: &[SmartString], + existing: &[PlSmallStr], + new: &[PlSmallStr], ) -> PolarsResult { let positions = existing .iter() @@ -14,7 +14,7 @@ pub(super) fn rename_impl( // the column might be removed due to projection pushdown // so we only update if we can find it. if let Some(pos) = pos { - unsafe { df.get_columns_mut()[*pos].rename(name) }; + unsafe { df.get_columns_mut()[*pos].rename(name.clone()) }; } } // recreate dataframe so we check duplicates diff --git a/crates/polars-plan/src/plans/functions/schema.rs b/crates/polars-plan/src/plans/functions/schema.rs index 58ae0a43609a..957bf17f090d 100644 --- a/crates/polars-plan/src/plans/functions/schema.rs +++ b/crates/polars-plan/src/plans/functions/schema.rs @@ -2,6 +2,7 @@ use polars_core::utils::try_get_supertype; use super::*; +use crate::constants::get_len_name; impl FunctionIR { pub(crate) fn clear_cached_schema(&self) { @@ -43,12 +44,7 @@ impl FunctionIR { Pipeline { schema, .. } => Ok(Cow::Owned(schema.clone())), FastCount { alias, .. } => { let mut schema: Schema = Schema::with_capacity(1); - let name = SmartString::from( - alias - .as_ref() - .map(|alias| alias.as_ref()) - .unwrap_or(crate::constants::LEN), - ); + let name = alias.clone().unwrap_or_else(get_len_name); schema.insert_at_index(0, name, IDX_DTYPE)?; Ok(Cow::Owned(Arc::new(schema))) }, @@ -97,9 +93,11 @@ impl FunctionIR { schema, .. } => rename_schema(input_schema, existing, new, schema), - RowIndex { schema, name, .. } => { - Ok(Cow::Owned(row_index_schema(schema, input_schema, name))) - }, + RowIndex { schema, name, .. 
} => Ok(Cow::Owned(row_index_schema( + schema, + input_schema, + name.clone(), + ))), Explode { schema, columns } => explode_schema(schema, input_schema, columns), #[cfg(feature = "pivot")] Unpivot { schema, args } => unpivot_schema(args, schema, input_schema), @@ -110,14 +108,14 @@ impl FunctionIR { fn row_index_schema( cached_schema: &CachedSchema, input_schema: &SchemaRef, - name: &str, + name: PlSmallStr, ) -> SchemaRef { let mut guard = cached_schema.lock().unwrap(); if let Some(schema) = &*guard { return schema.clone(); } let mut schema = (**input_schema).clone(); - schema.insert_at_index(0, name.into(), IDX_DTYPE).unwrap(); + schema.insert_at_index(0, name, IDX_DTYPE).unwrap(); let schema_ref = Arc::new(schema); *guard = Some(schema_ref.clone()); schema_ref @@ -126,7 +124,7 @@ fn row_index_schema( fn explode_schema<'a>( cached_schema: &CachedSchema, schema: &'a Schema, - columns: &[Arc], + columns: &[PlSmallStr], ) -> PolarsResult> { let mut guard = cached_schema.lock().unwrap(); if let Some(schema) = &*guard { @@ -161,7 +159,7 @@ fn unpivot_schema<'a>( let mut new_schema = args .index .iter() - .map(|id| Ok(Field::new(id, input_schema.try_get(id)?.clone()))) + .map(|id| Ok(Field::new(id.clone(), input_schema.try_get(id)?.clone()))) .collect::>()?; let variable_name = args .variable_name @@ -201,8 +199,8 @@ fn unpivot_schema<'a>( fn rename_schema<'a>( input_schema: &'a SchemaRef, - existing: &[SmartString], - new: &[SmartString], + existing: &[PlSmallStr], + new: &[PlSmallStr], cached_schema: &CachedSchema, ) -> PolarsResult> { let mut guard = cached_schema.lock().unwrap(); diff --git a/crates/polars-plan/src/plans/hive.rs b/crates/polars-plan/src/plans/hive.rs index a89c8a32a127..3fc7531ea2b3 100644 --- a/crates/polars-plan/src/plans/hive.rs +++ b/crates/polars-plan/src/plans/hive.rs @@ -17,7 +17,7 @@ pub struct HivePartitions { impl HivePartitions { pub fn get_projection_schema_and_indices( &self, - names: &PlHashSet, + names: &PlHashSet, ) -> (SchemaRef, 
Vec) { let mut out_schema = Schema::with_capacity(self.stats.schema().len()); let mut out_indices = Vec::with_capacity(self.stats.column_stats().len()); @@ -114,7 +114,7 @@ pub fn hive_partitions_from_paths( dtype.clone() }; - Ok(Field::new(name, dtype)) + Ok(Field::new(PlSmallStr::from_str(name), dtype)) }).collect::>()?) } else { let mut hive_schema = Schema::with_capacity(16); diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 8fb6dbe5444d..69e3a69733c5 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -2,6 +2,7 @@ use std::fmt; use std::path::PathBuf; use polars_core::schema::Schema; +use polars_utils::pl_str::PlSmallStr; use super::format::ExprIRSliceDisplay; use crate::constants::UNLIMITED_CACHE; @@ -342,7 +343,7 @@ impl<'a> IRDotDisplay<'a> { // A few utility structures for formatting pub struct PathsDisplay<'a>(pub &'a [PathBuf]); -struct NumColumns<'a>(Option<&'a [String]>); +struct NumColumns<'a>(Option<&'a [PlSmallStr]>); struct NumColumnsSchema<'a>(Option<&'a Schema>); struct OptionExprIRDisplay<'a>(Option>); diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index b8b0378419d6..443726affad0 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize}; use crate::prelude::*; +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub struct IRPlan { pub lp_top: Node, pub lp_arena: Arena, diff --git a/crates/polars-plan/src/plans/lit.rs b/crates/polars-plan/src/plans/lit.rs index c0dcab76d3c6..060bbf0fd460 100644 --- a/crates/polars-plan/src/plans/lit.rs +++ b/crates/polars-plan/src/plans/lit.rs @@ -8,7 +8,7 @@ use polars_utils::hashing::hash_to_partition; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use crate::constants::{get_literal_name, LITERAL_NAME}; +use crate::constants::get_literal_name; use 
crate::prelude::*; #[derive(Clone, PartialEq)] @@ -18,7 +18,7 @@ pub enum LiteralValue { /// A binary true or false. Boolean(bool), /// A UTF8 encoded string type. - String(String), + String(PlSmallStr), /// A raw binary array Binary(Vec), /// An unsigned 8-bit integer number. @@ -67,22 +67,22 @@ pub enum LiteralValue { // Used for dynamic languages Int(i128), // Dynamic string, still needs to be made concrete. - StrCat(String), + StrCat(PlSmallStr), } impl LiteralValue { /// Get the output name as `&str`. - pub(crate) fn output_name(&self) -> &str { + pub(crate) fn output_name(&self) -> &PlSmallStr { match self { LiteralValue::Series(s) => s.name(), - _ => LITERAL_NAME, + _ => get_literal_name(), } } - /// Get the output name as [`ColumnName`]. - pub(crate) fn output_column_name(&self) -> ColumnName { + /// Get the output name as [`PlSmallStr`]. + pub(crate) fn output_column_name(&self) -> &PlSmallStr { match self { - LiteralValue::Series(s) => ColumnName::from(s.name()), + LiteralValue::Series(s) => s.name(), _ => get_literal_name(), } } @@ -152,12 +152,14 @@ impl LiteralValue { let low = *low as i32; let high = *high as i32; - new_int_range::(low, high, 1, "range").ok() + new_int_range::(low, high, 1, PlSmallStr::from_static("range")) + .ok() }, DataType::Int64 => { let low = *low; let high = *high; - new_int_range::(low, high, 1, "range").ok() + new_int_range::(low, high, 1, PlSmallStr::from_static("range")) + .ok() }, DataType::UInt32 => { if *low < 0 || *high > u32::MAX as i64 { @@ -165,7 +167,8 @@ impl LiteralValue { } let low = *low as u32; let high = *high as u32; - new_int_range::(low, high, 1, "range").ok() + new_int_range::(low, high, 1, PlSmallStr::from_static("range")) + .ok() }, _ => return None, }; @@ -237,15 +240,21 @@ pub trait TypedLiteral: Literal { impl TypedLiteral for String {} impl TypedLiteral for &str {} -impl Literal for String { +impl Literal for PlSmallStr { fn lit(self) -> Expr { Expr::Literal(LiteralValue::String(self)) } } +impl 
Literal for String { + fn lit(self) -> Expr { + Expr::Literal(LiteralValue::String(PlSmallStr::from_string(self))) + } +} + impl<'a> Literal for &'a str { fn lit(self) -> Expr { - Expr::Literal(LiteralValue::String(self.to_string())) + Expr::Literal(LiteralValue::String(PlSmallStr::from_str(self))) } } @@ -267,7 +276,7 @@ impl TryFrom> for LiteralValue { match value { AnyValue::Null => Ok(Self::Null), AnyValue::Boolean(b) => Ok(Self::Boolean(b)), - AnyValue::String(s) => Ok(Self::String(s.to_string())), + AnyValue::String(s) => Ok(Self::String(PlSmallStr::from_str(s))), AnyValue::Binary(b) => Ok(Self::Binary(b.to_vec())), #[cfg(feature = "dtype-u8")] AnyValue::UInt8(u) => Ok(Self::UInt8(u)), @@ -294,16 +303,16 @@ impl TryFrom> for LiteralValue { #[cfg(feature = "dtype-time")] AnyValue::Time(v) => Ok(LiteralValue::Time(v)), AnyValue::List(l) => Ok(Self::Series(SpecialEq::new(l))), - AnyValue::StringOwned(o) => Ok(Self::String(o.into())), + AnyValue::StringOwned(o) => Ok(Self::String(o)), #[cfg(feature = "dtype-categorical")] AnyValue::Categorical(c, rev_mapping, arr) | AnyValue::Enum(c, rev_mapping, arr) => { if arr.is_null() { - Ok(Self::String(rev_mapping.get(c).to_string())) + Ok(Self::String(PlSmallStr::from_str(rev_mapping.get(c)))) } else { unsafe { - Ok(Self::String( - arr.deref_unchecked().value(c as usize).to_string(), - )) + Ok(Self::String(PlSmallStr::from_str( + arr.deref_unchecked().value(c as usize), + ))) } } }, diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index 080c468454e7..d225683a0d3f 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -51,8 +51,6 @@ pub use schema::*; use serde::{Deserialize, Serialize}; use strum_macros::IntoStaticStr; -pub type ColumnName = Arc; - #[derive(Clone, Copy, Debug)] pub enum Context { /// Any operation that is done on groups diff --git a/crates/polars-plan/src/plans/optimizer/cache_states.rs 
b/crates/polars-plan/src/plans/optimizer/cache_states.rs index b66f73a18ae8..400abbb12ddd 100644 --- a/crates/polars-plan/src/plans/optimizer/cache_states.rs +++ b/crates/polars-plan/src/plans/optimizer/cache_states.rs @@ -6,7 +6,7 @@ fn get_upper_projections( parent: Node, lp_arena: &Arena, expr_arena: &Arena, - names_scratch: &mut Vec, + names_scratch: &mut Vec, found_required_columns: &mut bool, ) -> bool { let parent = lp_arena.get(parent); @@ -15,7 +15,7 @@ fn get_upper_projections( // During projection pushdown all accumulated. match parent { SimpleProjection { columns, .. } => { - let iter = columns.iter_names().map(|s| ColumnName::from(s.as_str())); + let iter = columns.iter_names().cloned(); names_scratch.extend(iter); *found_required_columns = true; false @@ -138,7 +138,7 @@ pub(super) fn set_cache_states( parents: Vec, cache_nodes: Vec, // Union over projected names. - names_union: PlHashSet, + names_union: PlHashSet, // Union over predicates. predicate_union: PlHashMap, } @@ -264,11 +264,7 @@ pub(super) fn set_cache_states( // all columns if !found_required_columns { let schema = lp.schema(lp_arena); - v.names_union.extend( - schema - .iter_names() - .map(|name| ColumnName::from(name.as_str())), - ); + v.names_union.extend(schema.iter_names().cloned()); } } frame.cache_id = Some(*id); diff --git a/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs b/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs index 160a9cbdbc0f..b3f52c6e30a9 100644 --- a/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs +++ b/crates/polars-plan/src/plans/optimizer/cluster_with_columns.rs @@ -8,9 +8,9 @@ use polars_utils::vec::inplace_zip_filtermap; use super::aexpr::AExpr; use super::ir::IR; -use super::{aexpr_to_leaf_names_iter, ColumnName}; +use super::{aexpr_to_leaf_names_iter, PlSmallStr}; -type ColumnMap = PlHashMap; +type ColumnMap = PlHashMap; fn column_map_finalize_bitset(bitset: &mut MutableBitmap, column_map: &ColumnMap) { 
assert!(bitset.len() <= column_map.len()); @@ -19,7 +19,7 @@ fn column_map_finalize_bitset(bitset: &mut MutableBitmap, column_map: &ColumnMap bitset.extend_constant(column_map.len() - size, false); } -fn column_map_set(bitset: &mut MutableBitmap, column_map: &mut ColumnMap, column: ColumnName) { +fn column_map_set(bitset: &mut MutableBitmap, column_map: &mut ColumnMap, column: PlSmallStr) { let size = column_map.len(); column_map .entry(column) @@ -92,7 +92,7 @@ pub fn optimize(root: Node, lp_arena: &mut Arena, expr_arena: &Arena) column_map_set( &mut input_genset, column_map, - input_expr.output_name_arc().clone(), + input_expr.output_name().clone(), ); } @@ -132,14 +132,12 @@ pub fn optimize(root: Node, lp_arena: &mut Arena, expr_arena: &Arena) return Some((expr, liveset)); } - let column_name = expr.output_name_arc(); + let column_name = expr.output_name(); let is_pushable = if let Some(idx) = column_map.get(column_name) { let does_input_alias_also_expr = input_genset.get(*idx); let is_alias_live_in_current = current_liveset.get(*idx); if does_input_alias_also_expr && !is_alias_live_in_current { - let column_name = column_name.as_ref(); - // @NOTE: Pruning of re-assigned columns // // We checked if this expression output is also assigned by the input and @@ -190,7 +188,7 @@ pub fn optimize(root: Node, lp_arena: &mut Arena, expr_arena: &Arena) // This will pushdown the expressions that "has an output column that is mentioned by // neighbour columns, but all those neighbours were being pushed down". 
for candidate in potential_pushable.iter().copied() { - let column_name = current_exprs[candidate].output_name_arc(); + let column_name = current_exprs[candidate].output_name(); let column_idx = column_map.get(column_name).unwrap(); current_liveset.clear(); @@ -258,7 +256,7 @@ pub fn optimize(root: Node, lp_arena: &mut Arena, expr_arena: &Arena) if do_pushdown { needs_simple_projection = has_seen_unpushable; - let column = expr.output_name_arc().as_ref(); + let column = expr.output_name().as_ref(); // @NOTE: we cannot just use the index here, as there might be renames that sit // earlier in the schema let datatype = current_schema.get(column).unwrap(); diff --git a/crates/polars-plan/src/plans/optimizer/collapse_and_project.rs b/crates/polars-plan/src/plans/optimizer/collapse_and_project.rs index e4c0ac87151a..d79e8917591b 100644 --- a/crates/polars-plan/src/plans/optimizer/collapse_and_project.rs +++ b/crates/polars-plan/src/plans/optimizer/collapse_and_project.rs @@ -52,7 +52,7 @@ impl OptimizationRule for SimpleProjectionAndCollapse { let exprs = expr .iter() - .map(|e| e.output_name_arc().clone()) + .map(|e| e.output_name().clone()) .collect::>(); let alp = IRBuilder::new(*input, expr_arena, lp_arena) .project_simple(exprs.iter().map(|s| s.as_ref())) diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index d1ae618c0a2b..32a95cc3ede3 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -53,7 +53,7 @@ struct CountStarExpr { // File Type scan_type: FileScan, // Column Alias - alias: Option>, + alias: Option, } // Visit the logical plan and return CountStarExpr with the expr information gathered @@ -125,7 +125,7 @@ fn visit_logical_plan_for_scan_paths( } } -fn is_valid_count_expr(e: &ExprIR, expr_arena: &Arena) -> (bool, Option>) { +fn is_valid_count_expr(e: &ExprIR, expr_arena: &Arena) -> (bool, Option) { match 
expr_arena.get(e.node()) { AExpr::Len => (true, e.get_alias().cloned()), _ => (false, None), diff --git a/crates/polars-plan/src/plans/optimizer/cse/cse_expr.rs b/crates/polars-plan/src/plans/optimizer/cse/cse_expr.rs index fe6f94b9e19e..6b7763760fa1 100644 --- a/crates/polars-plan/src/plans/optimizer/cse/cse_expr.rs +++ b/crates/polars-plan/src/plans/optimizer/cse/cse_expr.rs @@ -1,4 +1,5 @@ use hashbrown::hash_map::RawEntryMut; +use polars_utils::format_pl_smallstr; use polars_utils::vec::CapacityByFactor; use super::*; @@ -74,8 +75,8 @@ impl Identifier { self.inner.is_some() } - fn materialize(&self) -> String { - format!("{}{:#x}", CSE_REPLACED, self.materialized_hash()) + fn materialize(&self) -> PlSmallStr { + format_pl_smallstr!("{}{:#x}", CSE_REPLACED, self.materialized_hash()) } fn materialized_hash(&self) -> u64 { @@ -590,7 +591,7 @@ impl RewritingVisitor for CommonSubExprRewriter<'_> { ); let name = id.materialize(); - node.assign(AExpr::col(name.as_ref()), arena); + node.assign(AExpr::col(name), arena); self.rewritten = true; Ok(node) @@ -723,7 +724,7 @@ impl CommonSubExprOptimizer { // intermediate temporary names starting with the `CSE_REPLACED` constant. 
if !e.has_alias() { let name = ae_node.to_field(schema, expr_arena)?.name; - out_e.set_alias(ColumnName::from(name.as_str())); + out_e.set_alias(name.clone()); } out_e }; @@ -733,7 +734,7 @@ impl CommonSubExprOptimizer { for id in self.replaced_identifiers.inner.keys() { let (node, _count) = self.se_count.get(id, expr_arena).unwrap(); let name = id.materialize(); - let out_e = ExprIR::new(*node, OutputName::Alias(ColumnName::from(name))); + let out_e = ExprIR::new(*node, OutputName::Alias(name)); new_expr.push(out_e) } let expr = diff --git a/crates/polars-plan/src/plans/optimizer/fused.rs b/crates/polars-plan/src/plans/optimizer/fused.rs index d548147f65ce..cb84ca1b385f 100644 --- a/crates/polars-plan/src/plans/optimizer/fused.rs +++ b/crates/polars-plan/src/plans/optimizer/fused.rs @@ -106,10 +106,7 @@ impl OptimizationRule for FusedArithmetic { let node = expr_arena.add(fma); // we reordered the arguments, so we don't obey the left expression output name // rule anymore, that's why we alias - Ok(Some(Alias( - node, - ColumnName::from(output_field.name.as_str()), - ))) + Ok(Some(Alias(node, output_field.name.clone()))) }, _ => unreachable!(), }, diff --git a/crates/polars-plan/src/plans/optimizer/mod.rs b/crates/polars-plan/src/plans/optimizer/mod.rs index 49dacbf7e6b8..4215347f2e7d 100644 --- a/crates/polars-plan/src/plans/optimizer/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/mod.rs @@ -72,6 +72,13 @@ pub fn optimize( let opt = StackOptimizer {}; let mut rules: Vec> = Vec::with_capacity(8); + // Unset CSE (both subplan and subexpression elimination) for eager execution. + // This can be turned on again during ir-conversion. + #[allow(clippy::eq_op)] + #[cfg(feature = "cse")] + if opt_state.contains(OptFlags::EAGER) { + opt_state &= !(OptFlags::COMM_SUBPLAN_ELIM | OptFlags::COMM_SUBEXPR_ELIM); + } let mut lp_top = to_alp(logical_plan, expr_arena, lp_arena, &mut opt_state)?; // get toggle values @@ -87,10 +94,10 @@ pub fn optimize( // This keeps eager execution more snappy.
let eager = opt_state.contains(OptFlags::EAGER); #[cfg(feature = "cse")] - let comm_subplan_elim = opt_state.contains(OptFlags::COMM_SUBPLAN_ELIM) && !eager; + let comm_subplan_elim = opt_state.contains(OptFlags::COMM_SUBPLAN_ELIM); #[cfg(feature = "cse")] - let comm_subexpr_elim = opt_state.contains(OptFlags::COMM_SUBEXPR_ELIM) && !eager; + let comm_subexpr_elim = opt_state.contains(OptFlags::COMM_SUBEXPR_ELIM); #[cfg(not(feature = "cse"))] let comm_subexpr_elim = false; diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/group_by.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/group_by.rs index 208f2dca0973..6c6d4460b29e 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/group_by.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/group_by.rs @@ -12,7 +12,7 @@ pub(super) fn process_group_by( maintain_order: bool, apply: Option>, options: Arc, - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, ) -> PolarsResult { use IR::*; diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs index c787336af375..65121fdca00e 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs @@ -8,7 +8,7 @@ struct LeftRight(T, T); fn should_block_join_specific( ae: &AExpr, how: &JoinType, - on_names: &PlHashSet>, + on_names: &PlHashSet, expr_arena: &Arena, schema_left: &Schema, schema_right: &Schema, @@ -130,7 +130,7 @@ pub(super) fn process_join( right_on: Vec, schema: SchemaRef, options: Arc, - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, ) -> PolarsResult { use IR::*; let schema_left = lp_arena.get(input_left).schema(lp_arena); diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/keys.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/keys.rs index 
a11e2a8f0093..08eb14d2feb4 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/keys.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/keys.rs @@ -2,29 +2,27 @@ use super::*; // an invisible ascii token we use as delimiter -const HIDDEN_DELIMITER: char = '\u{1D17A}'; +const HIDDEN_DELIMITER: &str = "\u{1D17A}"; /// Determine the hashmap key by combining all the leaf column names of a predicate -pub(super) fn predicate_to_key(predicate: Node, expr_arena: &Arena) -> Arc { +pub(super) fn predicate_to_key(predicate: Node, expr_arena: &Arena) -> PlSmallStr { let mut iter = aexpr_to_leaf_names_iter(predicate, expr_arena); if let Some(first) = iter.next() { if let Some(second) = iter.next() { let mut new = String::with_capacity(32 * iter.size_hint().0); new.push_str(&first); - new.push(HIDDEN_DELIMITER); + new.push_str(HIDDEN_DELIMITER); new.push_str(&second); for name in iter { - new.push(HIDDEN_DELIMITER); + new.push_str(HIDDEN_DELIMITER); new.push_str(&name); } - return Arc::from(new); + return PlSmallStr::from_string(new); } first } else { - let mut s = String::new(); - s.push(HIDDEN_DELIMITER); - Arc::from(s) + PlSmallStr::from_str(HIDDEN_DELIMITER) } } diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index a8b922b6d726..1def3d375958 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -59,7 +59,7 @@ impl<'a> PredicatePushDown<'a> { fn pushdown_and_assign( &self, input: Node, - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, lp_arena: &mut Arena, expr_arena: &mut Arena, ) -> PolarsResult<()> { @@ -73,7 +73,7 @@ impl<'a> PredicatePushDown<'a> { fn pushdown_and_continue( &self, lp: IR, - mut acc_predicates: PlHashMap, ExprIR>, + mut acc_predicates: PlHashMap, lp_arena: &mut Arena, expr_arena: &mut Arena, has_projections: bool, 
@@ -188,7 +188,7 @@ impl<'a> PredicatePushDown<'a> { fn no_pushdown_restart_opt( &self, lp: IR, - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, lp_arena: &mut Arena, expr_arena: &mut Arena, ) -> PolarsResult { @@ -219,7 +219,7 @@ impl<'a> PredicatePushDown<'a> { fn no_pushdown( &self, lp: IR, - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, lp_arena: &mut Arena, expr_arena: &mut Arena, ) -> PolarsResult { @@ -243,7 +243,7 @@ impl<'a> PredicatePushDown<'a> { fn push_down( &self, lp: IR, - mut acc_predicates: PlHashMap, ExprIR>, + mut acc_predicates: PlHashMap, lp_arena: &mut Arena, expr_arena: &mut Arena, ) -> PolarsResult { @@ -262,7 +262,7 @@ impl<'a> PredicatePushDown<'a> { // // (2) can be pushed past (1) but they both have the same predicate // key name in the hashtable. - let tmp_key = Arc::::from(&*temporary_unique_key(&acc_predicates)); + let tmp_key = temporary_unique_key(&acc_predicates); acc_predicates.insert(tmp_key.clone(), predicate.clone()); let local_predicates = match pushdown_eligibility( @@ -454,12 +454,12 @@ impl<'a> PredicatePushDown<'a> { if let Some(ref subset) = options.subset { // Predicates on the subset can pass. let subset = subset.clone(); - let mut names_set = PlHashSet::<&str>::with_capacity(subset.len()); + let mut names_set = PlHashSet::::with_capacity(subset.len()); for name in subset.iter() { - names_set.insert(name.as_ref()); + names_set.insert(name.clone()); } - let condition = |name: Arc| !names_set.contains(name.as_ref()); + let condition = |name: &PlSmallStr| !names_set.contains(name.as_str()); let local_predicates = transfer_to_local_by_name(expr_arena, &mut acc_predicates, condition); @@ -511,8 +511,7 @@ impl<'a> PredicatePushDown<'a> { )) }, FunctionIR::Explode { columns, .. 
} => { - let condition = - |name: Arc| columns.iter().any(|s| s.as_ref() == &*name); + let condition = |name: &PlSmallStr| columns.iter().any(|s| s == name); // first columns that refer to the exploded columns should be done here let local_predicates = transfer_to_local_by_name( @@ -537,15 +536,20 @@ impl<'a> PredicatePushDown<'a> { }, #[cfg(feature = "pivot")] FunctionIR::Unpivot { args, .. } => { - let variable_name = args.variable_name.as_deref().unwrap_or("variable"); - let value_name = args.value_name.as_deref().unwrap_or("value"); + let variable_name = &args + .variable_name + .clone() + .unwrap_or_else(|| PlSmallStr::from_static("variable")); + let value_name = &args + .value_name + .clone() + .unwrap_or_else(|| PlSmallStr::from_static("value")); // predicates that will be done at this level - let condition = |name: Arc| { - let name = &*name; + let condition = |name: &PlSmallStr| { name == variable_name || name == value_name - || args.on.iter().any(|s| s.as_str() == name) + || args.on.iter().any(|s| s == name) }; let local_predicates = transfer_to_local_by_name( expr_arena, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/rename.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/rename.rs index e094564f4ddc..d31372009d8d 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/rename.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/rename.rs @@ -1,11 +1,11 @@ -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use super::*; use crate::prelude::optimizer::predicate_pushdown::keys::{key_has_name, predicate_to_key}; fn remove_any_key_referencing_renamed( new: &str, - acc_predicates: &mut PlHashMap, ExprIR>, + acc_predicates: &mut PlHashMap, local_predicates: &mut Vec, ) { let mut move_to_local = vec![]; @@ -21,10 +21,10 @@ fn remove_any_key_referencing_renamed( } pub(super) fn process_rename( - acc_predicates: &mut PlHashMap, ExprIR>, + acc_predicates: &mut 
PlHashMap, expr_arena: &mut Arena, - existing: &[SmartString], - new: &[SmartString], + existing: &[PlSmallStr], + new: &[PlSmallStr], ) -> PolarsResult> { let mut local_predicates = vec![]; for (existing, new) in existing.iter().zip(new.iter()) { @@ -51,7 +51,7 @@ pub(super) fn process_rename( // This ensure the optimization is pushed down. if let Some(mut e) = acc_predicates.remove(new.as_str()) { let new_node = - rename_matching_aexpr_leaf_names(e.node(), expr_arena, new, existing); + rename_matching_aexpr_leaf_names(e.node(), expr_arena, new, existing.clone()); e.set_node(new_node); acc_predicates.insert(predicate_to_key(new_node, expr_arena), e); } else { diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/utils.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/utils.rs index d7480c463b7c..7f14f2269cfd 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/utils.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/utils.rs @@ -12,7 +12,7 @@ fn combine_by_and(left: Node, right: Node, arena: &mut Arena) -> Node { /// Don't overwrite predicates but combine them. pub(super) fn insert_and_combine_predicate( - acc_predicates: &mut PlHashMap, ExprIR>, + acc_predicates: &mut PlHashMap, predicate: &ExprIR, arena: &mut Arena, ) { @@ -27,7 +27,8 @@ pub(super) fn insert_and_combine_predicate( .or_insert_with(|| predicate.clone()); } -pub(super) fn temporary_unique_key(acc_predicates: &PlHashMap, ExprIR>) -> String { +pub(super) fn temporary_unique_key(acc_predicates: &PlHashMap) -> PlSmallStr { + // TODO: Don't heap allocate during construction. 
let mut out_key = '\u{1D17A}'.to_string(); let mut existing_keys = acc_predicates.keys(); @@ -35,7 +36,7 @@ pub(super) fn temporary_unique_key(acc_predicates: &PlHashMap, ExprIR>) out_key.push_str(existing_keys.next().unwrap()); } - out_key + PlSmallStr::from_string(out_key) } pub(super) fn combine_predicates(iter: I, arena: &mut Arena) -> ExprIR @@ -59,7 +60,7 @@ where } pub(super) fn predicate_at_scan( - acc_predicates: PlHashMap, ExprIR>, + acc_predicates: PlHashMap, predicate: Option, expr_arena: &mut Arena, ) -> Option { @@ -111,18 +112,18 @@ pub(super) fn predicate_is_sort_boundary(node: Node, expr_arena: &Arena) /// transferred to local. pub(super) fn transfer_to_local_by_name( expr_arena: &Arena, - acc_predicates: &mut PlHashMap, ExprIR>, + acc_predicates: &mut PlHashMap, mut condition: F, ) -> Vec where - F: FnMut(Arc) -> bool, + F: FnMut(&PlSmallStr) -> bool, { let mut remove_keys = Vec::with_capacity(acc_predicates.len()); for (key, predicate) in &*acc_predicates { let root_names = aexpr_to_leaf_names(predicate.node(), expr_arena); for name in root_names { - if condition(name) { + if condition(&name) { remove_keys.push(key.clone()); break; } @@ -210,7 +211,7 @@ fn check_and_extend_predicate_pd_nodes( fn get_maybe_aliased_projection_to_input_name_map( e: &ExprIR, expr_arena: &Arena, -) -> Option<(Arc, Arc)> { +) -> Option<(PlSmallStr, PlSmallStr)> { let ae = expr_arena.get(e.node()); match e.get_alias() { Some(alias) => match ae { @@ -227,27 +228,27 @@ fn get_maybe_aliased_projection_to_input_name_map( pub enum PushdownEligibility { Full, // Partial can happen when there are window exprs. 
- Partial { to_local: Vec> }, + Partial { to_local: Vec }, NoPushdown, } #[allow(clippy::type_complexity)] pub fn pushdown_eligibility( projection_nodes: &[ExprIR], - new_predicates: &[(Arc, ExprIR)], - acc_predicates: &PlHashMap, ExprIR>, + new_predicates: &[(PlSmallStr, ExprIR)], + acc_predicates: &PlHashMap, expr_arena: &mut Arena, -) -> PolarsResult<(PushdownEligibility, PlHashMap, Arc>)> { +) -> PolarsResult<(PushdownEligibility, PlHashMap)> { let mut ae_nodes_stack = Vec::::with_capacity(4); let mut alias_to_col_map = - optimizer::init_hashmap::, Arc>(Some(projection_nodes.len())); + optimizer::init_hashmap::(Some(projection_nodes.len())); let mut col_to_alias_map = alias_to_col_map.clone(); let mut modified_projection_columns = - PlHashSet::>::with_capacity(projection_nodes.len()); + PlHashSet::::with_capacity(projection_nodes.len()); let mut has_window = false; - let mut common_window_inputs = PlHashSet::>::new(); + let mut common_window_inputs = PlHashSet::::new(); // Important: Names inserted into any data structure by this function are // all non-aliased. 
@@ -255,7 +256,7 @@ pub fn pushdown_eligibility( let process_projection_or_predicate = |ae_nodes_stack: &mut Vec, has_window: &mut bool, - common_window_inputs: &mut PlHashSet>| { + common_window_inputs: &mut PlHashSet| { debug_assert_eq!(ae_nodes_stack.len(), 1); while let Some(node) = ae_nodes_stack.pop() { @@ -276,7 +277,7 @@ pub fn pushdown_eligibility( }; let mut partition_by_names = - PlHashSet::>::with_capacity(partition_by.len()); + PlHashSet::::with_capacity(partition_by.len()); for node in partition_by.iter() { // Only accept col() @@ -333,7 +334,7 @@ pub fn pushdown_eligibility( continue; } - modified_projection_columns.insert(e.output_name_arc().clone()); + modified_projection_columns.insert(e.output_name().clone()); debug_assert!(ae_nodes_stack.is_empty()); ae_nodes_stack.push(e.node()); @@ -349,7 +350,7 @@ pub fn pushdown_eligibility( if has_window && !col_to_alias_map.is_empty() { // Rename to aliased names. - let mut new = PlHashSet::>::with_capacity(2 * common_window_inputs.len()); + let mut new = PlHashSet::::with_capacity(2 * common_window_inputs.len()); for key in common_window_inputs.into_iter() { if let Some(aliased) = col_to_alias_map.get(&key) { @@ -392,7 +393,7 @@ pub fn pushdown_eligibility( } // Note: has_window is constant. 
- let can_use_column = |col: &Arc| { + let can_use_column = |col: &str| { if has_window { common_window_inputs.contains(col) } else { diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs index 08c0ddf15bd6..9b0dc58b8cf0 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs @@ -12,7 +12,7 @@ pub(super) fn process_functions( input: Node, function: FunctionIR, mut acc_projections: Vec, - mut projected_names: PlHashSet>, + mut projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -52,7 +52,12 @@ pub(super) fn process_functions( }, Explode { columns, .. } => { columns.iter().for_each(|name| { - add_str_to_accumulated(name, &mut acc_projections, &mut projected_names, expr_arena) + add_str_to_accumulated( + name.clone(), + &mut acc_projections, + &mut projected_names, + expr_arena, + ) }); proj_pd.pushdown_and_assign( input, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/unpivot.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/unpivot.rs index 56d9253ba2f3..518c8e081c5e 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/unpivot.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/unpivot.rs @@ -29,10 +29,20 @@ pub(super) fn process_unpivot( // make sure that the requested columns are projected args.index.iter().for_each(|name| { - add_str_to_accumulated(name, &mut acc_projections, &mut projected_names, expr_arena) + add_str_to_accumulated( + name.clone(), + &mut acc_projections, + &mut projected_names, + expr_arena, + ) }); args.on.iter().for_each(|name| { - add_str_to_accumulated(name, &mut acc_projections, &mut projected_names, expr_arena) + add_str_to_accumulated( + name.clone(), + 
&mut acc_projections, + &mut projected_names, + expr_arena, + ) }); proj_pd.pushdown_and_assign( diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/generic.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/generic.rs index e1326864a283..ee9a60738f22 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/generic.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/generic.rs @@ -5,7 +5,7 @@ pub(super) fn process_generic( proj_pd: &mut ProjectionPushDown, lp: IR, acc_projections: Vec, - projected_names: PlHashSet>, + projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/group_by.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/group_by.rs index 1ed124ef79ea..1dc1abcc3ba8 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/group_by.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/group_by.rs @@ -11,7 +11,7 @@ pub(super) fn process_group_by( maintain_order: bool, options: Arc, acc_projections: Vec, - projected_names: PlHashSet>, + projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -49,7 +49,7 @@ pub(super) fn process_group_by( .into_iter() .filter(|agg| { if has_pushed_down && projections_seen > 0 { - projected_names.contains(agg.output_name_arc()) + projected_names.contains(agg.output_name()) } else { true } @@ -68,17 +68,13 @@ pub(super) fn process_group_by( // make sure that the dynamic key is projected #[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.dynamic { - let node = expr_arena.add(AExpr::Column(ColumnName::from( - options.index_column.as_str(), - ))); + let node = expr_arena.add(AExpr::Column(options.index_column.clone())); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); } // make sure that the rolling key is projected 
#[cfg(feature = "dynamic_group_by")] if let Some(options) = &options.rolling { - let node = expr_arena.add(AExpr::Column(ColumnName::from( - options.index_column.as_str(), - ))); + let node = expr_arena.add(AExpr::Column(options.index_column.clone())); add_expr_to_accumulated(node, &mut acc_projections, &mut names, expr_arena); } diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/hstack.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/hstack.rs index 628741511d86..8096b5bde3d8 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/hstack.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/hstack.rs @@ -7,7 +7,7 @@ pub(super) fn process_hstack( mut exprs: Vec, options: ProjectionOptions, mut acc_projections: Vec, - mut projected_names: PlHashSet>, + mut projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs index 007ca07cf206..116486e65a4d 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs @@ -8,11 +8,11 @@ fn add_keys_to_accumulated_state( expr: Node, acc_projections: &mut Vec, local_projection: &mut Vec, - projected_names: &mut PlHashSet>, + projected_names: &mut PlHashSet, expr_arena: &mut Arena, // only for left hand side table we add local names add_local: bool, -) -> Option> { +) -> Option { add_expr_to_accumulated(expr, acc_projections, projected_names, expr_arena); // the projections may do more than simply project. // e.g. 
col("foo").truncate() * col("bar") @@ -43,7 +43,7 @@ pub(super) fn process_asof_join( right_on: Vec, options: Arc, acc_projections: Vec, - _projected_names: PlHashSet>, + _projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -78,7 +78,7 @@ pub(super) fn process_asof_join( for name in left_by { let add = _projected_names.contains(name.as_str()); - let node = expr_arena.add(AExpr::Column(ColumnName::from(name.as_str()))); + let node = expr_arena.add(AExpr::Column(name.clone())); add_keys_to_accumulated_state( node, &mut pushdown_left, @@ -89,7 +89,7 @@ pub(super) fn process_asof_join( ); } for name in right_by { - let node = expr_arena.add(AExpr::Column(ColumnName::from(name.as_str()))); + let node = expr_arena.add(AExpr::Column(name.clone())); add_keys_to_accumulated_state( node, &mut pushdown_right, @@ -202,7 +202,7 @@ pub(super) fn process_join( right_on: Vec, mut options: Arc, acc_projections: Vec, - projected_names: PlHashSet>, + projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -252,7 +252,7 @@ pub(super) fn process_join( // We need the join columns so we push the projection downwards for e in &left_on { - if !local_projected_names.insert(e.output_name_arc().clone()) { + if !local_projected_names.insert(e.output_name().clone()) { continue; } @@ -384,8 +384,8 @@ fn process_projection( proj: ColumnNode, pushdown_left: &mut Vec, pushdown_right: &mut Vec, - names_left: &mut PlHashSet>, - names_right: &mut PlHashSet>, + names_left: &mut PlHashSet, + names_right: &mut PlHashSet, expr_arena: &mut Arena, local_projection: &mut Vec, add_local: bool, @@ -416,16 +416,17 @@ fn process_projection( // Column name of the projection without any alias. 
let leaf_column_name = column_node_to_name(proj, expr_arena).clone(); - let suffix = options.args.suffix(); + let suffix = options.args.suffix().as_str(); // If _right suffix exists we need to push a projection down without this // suffix. if leaf_column_name.ends_with(suffix) && join_schema.contains(leaf_column_name.as_ref()) { // downwards name is the name without the _right i.e. "foo". let downwards_name = split_suffix(leaf_column_name.as_ref(), suffix); + let downwards_name = PlSmallStr::from_str(downwards_name); - let downwards_name_column = expr_arena.add(AExpr::Column(Arc::from(downwards_name))); + let downwards_name_column = expr_arena.add(AExpr::Column(downwards_name.clone())); // project downwards and locally immediately alias to prevent wrong projections - if names_right.insert(ColumnName::from(downwards_name)) { + if names_right.insert(downwards_name) { pushdown_right.push(ColumnNode(downwards_name_column)); } local_projection.push(proj); @@ -470,7 +471,7 @@ fn resolve_join_suffixes( expr_arena: &mut Arena, local_projection: &[ColumnNode], ) -> PolarsResult { - let suffix = options.args.suffix(); + let suffix = options.args.suffix().as_str(); let alp = IRBuilder::new(input_left, expr_arena, lp_arena) .join(input_right, left_on, right_on, options.clone()) .build(); @@ -483,7 +484,7 @@ fn resolve_join_suffixes( let name = column_node_to_name(*proj, expr_arena).clone(); if name.ends_with(suffix) && schema_after_join.get(&name).is_none() { let downstream_name = &name.as_ref()[..name.len() - suffix.len()]; - let col = AExpr::Column(ColumnName::from(downstream_name)); + let col = AExpr::Column(downstream_name.into()); let node = expr_arena.add(col); all_columns = false; ExprIR::new(node, OutputName::Alias(name.clone())) @@ -496,7 +497,7 @@ fn resolve_join_suffixes( let builder = IRBuilder::from_lp(alp, expr_arena, lp_arena); Ok(if all_columns { builder - .project_simple(projections.iter().map(|e| e.output_name()))? 
+ .project_simple(projections.iter().map(|e| e.output_name().clone()))? .build() } else { builder.project(projections, Default::default()).build() diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index 0455ba7f5e9c..e5e2fb94ccde 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -29,41 +29,43 @@ use crate::utils::aexpr_to_leaf_names; fn init_vec() -> Vec { Vec::with_capacity(16) } -fn init_set() -> PlHashSet> { +fn init_set() -> PlHashSet { PlHashSet::with_capacity(32) } /// utility function to get names of the columns needed in projection at scan level fn get_scan_columns( - acc_projections: &Vec, + acc_projections: &[ColumnNode], expr_arena: &Arena, row_index: Option<&RowIndex>, file_path_col: Option<&str>, -) -> Option> { - let mut with_columns = None; +) -> Option> { if !acc_projections.is_empty() { - let mut columns = Vec::with_capacity(acc_projections.len()); - for expr in acc_projections { - let name = column_node_to_name(*expr, expr_arena); - // we shouldn't project the row-count column, as that is generated - // in the scan - if let Some(ri) = row_index { - if ri.name.as_ref() == name.as_ref() { - continue; - } - } + Some( + acc_projections + .iter() + .filter_map(|node| { + let name = column_node_to_name(*node, expr_arena); + + if let Some(ri) = row_index { + if ri.name == name { + return None; + } + } - if let Some(file_path_col) = file_path_col { - if file_path_col == name.as_ref() { - continue; - } - } + if let Some(file_path_col) = file_path_col { + if file_path_col == name.as_str() { + return None; + } + } - columns.push((**name).to_owned()) - } - with_columns = Some(Arc::from(columns)); + Some(name.clone()) + }) + .collect::>(), + ) + } else { + None } - with_columns } /// split in a projection vec that can be pushed down and a projection vec that 
should be used @@ -78,7 +80,7 @@ fn split_acc_projections( down_schema: &Schema, expr_arena: &Arena, expands_schema: bool, -) -> (Vec, Vec, PlHashSet>) { +) -> (Vec, Vec, PlHashSet) { // If node above has as many columns as the projection there is nothing to pushdown. if !expands_schema && down_schema.len() == acc_projections.len() { let local_projections = acc_projections; @@ -100,7 +102,7 @@ fn split_acc_projections( fn add_expr_to_accumulated( expr: Node, acc_projections: &mut Vec, - projected_names: &mut PlHashSet>, + projected_names: &mut PlHashSet, expr_arena: &Arena, ) { for root_node in aexpr_to_column_nodes_iter(expr, expr_arena) { @@ -112,14 +114,14 @@ fn add_expr_to_accumulated( } fn add_str_to_accumulated( - name: &str, + name: PlSmallStr, acc_projections: &mut Vec, - projected_names: &mut PlHashSet>, + projected_names: &mut PlHashSet, expr_arena: &mut Arena, ) { // if empty: all columns are already projected. - if !acc_projections.is_empty() && !projected_names.contains(name) { - let node = expr_arena.add(AExpr::Column(ColumnName::from(name))); + if !acc_projections.is_empty() && !projected_names.contains(&name) { + let node = expr_arena.add(AExpr::Column(name)); add_expr_to_accumulated(node, acc_projections, projected_names, expr_arena); } } @@ -225,8 +227,8 @@ impl ProjectionPushDown { proj: ColumnNode, pushdown_left: &mut Vec, pushdown_right: &mut Vec, - names_left: &mut PlHashSet>, - names_right: &mut PlHashSet>, + names_left: &mut PlHashSet, + names_right: &mut PlHashSet, expr_arena: &Arena, ) -> (bool, bool) { let mut pushed_at_least_one = false; @@ -257,7 +259,7 @@ impl ProjectionPushDown { &mut self, input: Node, acc_projections: Vec, - names: PlHashSet>, + names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -323,7 +325,7 @@ impl ProjectionPushDown { &mut self, logical_plan: IR, mut acc_projections: Vec, - mut projected_names: PlHashSet>, + mut projected_names: PlHashSet, projections_seen: usize, 
lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -344,7 +346,7 @@ impl ProjectionPushDown { expr_arena, ), SimpleProjection { columns, input, .. } => { - let exprs = names_to_expr_irs(columns.iter_names(), expr_arena); + let exprs = names_to_expr_irs(columns.iter_names().cloned(), expr_arena); process_projection( self, input, @@ -563,7 +565,7 @@ impl ProjectionPushDown { if let Some(subset) = options.subset.as_ref() { subset.iter().for_each(|name| { add_str_to_accumulated( - name, + name.clone(), &mut acc_projections, &mut projected_names, expr_arena, @@ -574,7 +576,7 @@ impl ProjectionPushDown { let input_schema = lp_arena.get(input).schema(lp_arena); for name in input_schema.iter_names() { add_str_to_accumulated( - name.as_str(), + name.clone(), &mut acc_projections, &mut projected_names, expr_arena, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs index 4fda3a2432bc..3e25b6a86841 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/projection.rs @@ -18,7 +18,7 @@ fn check_double_projection( expr: &ExprIR, expr_arena: &mut Arena, acc_projections: &mut Vec, - projected_names: &mut PlHashSet>, + projected_names: &mut PlHashSet, ) { // Factor out the pruning function fn prune_projections_by_name( @@ -50,7 +50,7 @@ pub(super) fn process_projection( input: Node, mut exprs: Vec, mut acc_projections: Vec, - mut projected_names: PlHashSet>, + mut projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, @@ -70,7 +70,7 @@ pub(super) fn process_projection( // simply select the last column // NOTE: the first can be the inserted index column, so that might not work let (first_name, _) = input_schema.try_get_at_index(input_schema.len() - 1)?; - let expr = expr_arena.add(AExpr::Column(ColumnName::from(first_name.as_str()))); + let 
expr = expr_arena.add(AExpr::Column(first_name.clone())); if !acc_projections.is_empty() { check_double_projection( &exprs[0], @@ -97,7 +97,7 @@ pub(super) fn process_projection( for e in exprs { if has_pushed_down { // remove projections that are not used upstream - if !projected_names.contains(e.output_name_arc()) { + if !projected_names.contains(e.output_name()) { continue; } diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs index 3f0a39d05a7b..b5ae3717f8c1 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs @@ -1,6 +1,6 @@ use std::collections::BTreeSet; -use smartstring::alias::String as SmartString; +use polars_utils::pl_str::PlSmallStr; use super::*; @@ -16,7 +16,7 @@ fn iter_and_update_nodes( if !processed.contains(&node.0) { // We walk the query backwards, so we rename new to existing if column_node_to_name(*column_node, expr_arena).as_ref() == new { - let new_node = expr_arena.add(AExpr::Column(ColumnName::from(existing))); + let new_node = expr_arena.add(AExpr::Column(PlSmallStr::from_str(existing))); *column_node = ColumnNode(new_node); processed.insert(new_node.0); } @@ -27,28 +27,24 @@ fn iter_and_update_nodes( #[allow(clippy::too_many_arguments)] pub(super) fn process_rename( acc_projections: &mut [ColumnNode], - projected_names: &mut PlHashSet>, + projected_names: &mut PlHashSet, expr_arena: &mut Arena, - existing: &[SmartString], - new: &[SmartString], + existing: &[PlSmallStr], + new: &[PlSmallStr], swapping: bool, ) -> PolarsResult<()> { if swapping { - let reverse_map: PlHashMap<_, _> = new - .iter() - .map(|s| s.as_str()) - .zip(existing.iter().map(|s| s.as_str())) - .collect(); + let reverse_map: PlHashMap<_, _> = + new.iter().cloned().zip(existing.iter().cloned()).collect(); let mut new_projected_names = 
PlHashSet::with_capacity(projected_names.len()); for col in acc_projections { let name = column_node_to_name(*col, expr_arena); if let Some(previous) = reverse_map.get(name.as_ref()) { - let previous: Arc = Arc::from(*previous); let new = expr_arena.add(AExpr::Column(previous.clone())); *col = ColumnNode(new); - let _ = new_projected_names.insert(previous); + let _ = new_projected_names.insert(previous.clone()); } else { let _ = new_projected_names.insert(name.clone()); } @@ -58,7 +54,7 @@ pub(super) fn process_rename( let mut processed = BTreeSet::new(); for (existing, new) in existing.iter().zip(new.iter()) { if projected_names.remove(new.as_str()) { - let name: Arc = ColumnName::from(existing.as_str()); + let name = existing.clone(); projected_names.insert(name); iter_and_update_nodes(existing, new, acc_projections, expr_arena, &mut processed); } diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/semi_anti_join.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/semi_anti_join.rs index 6b0863fa11cc..2cdb1edab260 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/semi_anti_join.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/semi_anti_join.rs @@ -9,7 +9,7 @@ pub(super) fn process_semi_anti_join( right_on: Vec, options: Arc, acc_projections: Vec, - _projected_names: PlHashSet>, + _projected_names: PlHashSet, projections_seen: usize, lp_arena: &mut Arena, expr_arena: &mut Arena, diff --git a/crates/polars-plan/src/plans/optimizer/simplify_expr.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr.rs index 2a8edcf42024..db03f516506c 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_expr.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr.rs @@ -407,7 +407,7 @@ fn string_addition_to_linear_concat( _ => Some(AExpr::Function { input: vec![left_e, right_e], function: StringFunction::ConcatHorizontal { - delimiter: "".to_string(), + delimiter: "".into(), ignore_nulls: 
false, } .into(), diff --git a/crates/polars-plan/src/plans/options.rs b/crates/polars-plan/src/plans/options.rs index 85506b7f6a15..078acbae7177 100644 --- a/crates/polars-plan/src/plans/options.rs +++ b/crates/polars-plan/src/plans/options.rs @@ -20,7 +20,7 @@ use polars_time::{DynamicGroupOptions, RollingGroupOptions}; use serde::{Deserialize, Serialize}; use crate::dsl::Selector; -use crate::plans::{ColumnName, ExprIR}; +use crate::plans::{ExprIR, PlSmallStr}; #[cfg(feature = "python")] use crate::prelude::python_udf::PythonFunction; @@ -31,14 +31,14 @@ pub type FileCount = u32; /// Generic options for all file types. pub struct FileScanOptions { pub slice: Option<(i64, usize)>, - pub with_columns: Option>, + pub with_columns: Option>, pub cache: bool, pub row_index: Option, pub rechunk: bool, pub file_counter: FileCount, pub hive_options: HiveOptions, pub glob: bool, - pub include_file_paths: Option>, + pub include_file_paths: Option, } #[derive(Clone, Debug, Copy, Default, Eq, PartialEq, Hash)] @@ -88,7 +88,7 @@ pub struct DistinctOptionsDSL { #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub struct DistinctOptionsIR { /// Subset of columns that will be taken into account. - pub subset: Option>, + pub subset: Option>, /// This will maintain the order of the input. /// Note that this is more expensive. /// `maintain_order` is not supported in the streaming @@ -257,7 +257,7 @@ pub struct PythonOptions { /// Schema the reader will produce when the file is read. pub output_schema: Option, // Projected column names. - pub with_columns: Option>, + pub with_columns: Option>, // Which interface is the python function. pub python_source: PythonScanSource, /// Optional predicate the reader must apply. 
diff --git a/crates/polars-plan/src/plans/schema.rs b/crates/polars-plan/src/plans/schema.rs index 3fa552e8a2ae..81ea7d92df96 100644 --- a/crates/polars-plan/src/plans/schema.rs +++ b/crates/polars-plan/src/plans/schema.rs @@ -4,7 +4,7 @@ use std::sync::Mutex; use arrow::datatypes::ArrowSchemaRef; use either::Either; use polars_core::prelude::*; -use polars_utils::format_smartstring; +use polars_utils::format_pl_smallstr; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -286,7 +286,7 @@ pub(crate) fn det_join_schema( { let left_is_removed = join_on_left.contains(name.as_str()) && should_coalesce; if schema_left.contains(name.as_str()) && !left_is_removed { - let new_name = format_smartstring!("{}{}", name, options.args.suffix()); + let new_name = format_pl_smallstr!("{}{}", name, options.args.suffix()); new_schema.with_column(new_name, dtype.clone()); } else { new_schema.with_column(name.clone(), dtype.clone()); @@ -319,7 +319,7 @@ pub(crate) fn det_join_schema( if should_coalesce && field_left.name != field_right.name { if schema_left.contains(&field_right.name) { new_schema.with_column( - _join_suffix_name(&field_right.name, options.args.suffix()).into(), + _join_suffix_name(&field_right.name, options.args.suffix()), field_right.dtype, ); } else { @@ -351,7 +351,7 @@ pub(crate) fn det_join_schema( // The names that are joined on are merged if schema_left.contains(name.as_str()) { - let new_name = format_smartstring!("{}{}", name, options.args.suffix()); + let new_name = format_pl_smallstr!("{}{}", name, options.args.suffix()); new_schema.with_column(new_name, dtype.clone()); } else { new_schema.with_column(name.clone(), dtype.clone()); diff --git a/crates/polars-plan/src/utils.rs b/crates/polars-plan/src/utils.rs index 25f3b2ff5cc6..20ce08021de0 100644 --- a/crates/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -3,16 +3,18 @@ use std::iter::FlatMap; use polars_core::prelude::*; use polars_utils::idx_vec::UnitVec; -use 
smartstring::alias::String as SmartString; -use crate::constants::{get_len_name, LEN}; +use crate::constants::get_len_name; use crate::prelude::*; /// Utility to write comma delimited strings -pub fn comma_delimited(mut s: String, items: &[SmartString]) -> String { +pub fn comma_delimited(mut s: String, items: &[S]) -> String +where + S: AsRef, +{ s.push('('); for c in items { - s.push_str(c); + s.push_str(c.as_ref()); s.push_str(", "); } s.pop(); @@ -135,7 +137,7 @@ pub fn has_null(current_expr: &Expr) -> bool { }) } -pub fn aexpr_output_name(node: Node, arena: &Arena) -> PolarsResult> { +pub fn aexpr_output_name(node: Node, arena: &Arena) -> PolarsResult { for (_, ae) in arena.iter(node) { match ae { // don't follow the partition by branch @@ -143,7 +145,7 @@ pub fn aexpr_output_name(node: Node, arena: &Arena) -> PolarsResult return Ok(name.clone()), AExpr::Alias(_, name) => return Ok(name.clone()), AExpr::Len => return Ok(get_len_name()), - AExpr::Literal(val) => return Ok(val.output_column_name()), + AExpr::Literal(val) => return Ok(val.output_column_name().clone()), _ => {}, } } @@ -155,7 +157,7 @@ pub fn aexpr_output_name(node: Node, arena: &Arena) -> PolarsResult PolarsResult> { +pub fn expr_output_name(expr: &Expr) -> PolarsResult { for e in expr { match e { // don't follow the partition by branch @@ -171,7 +173,7 @@ pub fn expr_output_name(expr: &Expr) -> PolarsResult> { "this expression may produce multiple output names" ), Expr::Len => return Ok(get_len_name()), - Expr::Literal(val) => return Ok(val.output_column_name()), + Expr::Literal(val) => return Ok(val.output_column_name().clone()), _ => {}, } } @@ -183,7 +185,7 @@ pub fn expr_output_name(expr: &Expr) -> PolarsResult> { /// This function should be used to find the name of the start of an expression /// Normal iteration would just return the first root column it found -pub(crate) fn get_single_leaf(expr: &Expr) -> PolarsResult> { +pub(crate) fn get_single_leaf(expr: &Expr) -> PolarsResult { for e 
in expr { match e { Expr::Filter { input, .. } => return get_single_leaf(input), @@ -191,7 +193,7 @@ pub(crate) fn get_single_leaf(expr: &Expr) -> PolarsResult> { Expr::SortBy { expr, .. } => return get_single_leaf(expr), Expr::Window { function, .. } => return get_single_leaf(function), Expr::Column(name) => return Ok(name.clone()), - Expr::Len => return Ok(ColumnName::from(LEN)), + Expr::Len => return Ok(get_len_name()), _ => {}, } } @@ -201,17 +203,17 @@ pub(crate) fn get_single_leaf(expr: &Expr) -> PolarsResult> { } #[allow(clippy::type_complexity)] -pub fn expr_to_leaf_column_names_iter(expr: &Expr) -> impl Iterator> + '_ { +pub fn expr_to_leaf_column_names_iter(expr: &Expr) -> impl Iterator + '_ { expr_to_leaf_column_exprs_iter(expr).flat_map(|e| expr_to_leaf_column_name(e).ok()) } /// This should gradually replace expr_to_root_column as this will get all names in the tree. -pub fn expr_to_leaf_column_names(expr: &Expr) -> Vec> { +pub fn expr_to_leaf_column_names(expr: &Expr) -> Vec { expr_to_leaf_column_names_iter(expr).collect() } /// unpack alias(col) to name of the root column name -pub fn expr_to_leaf_column_name(expr: &Expr) -> PolarsResult> { +pub fn expr_to_leaf_column_name(expr: &Expr) -> PolarsResult { let mut leaves = expr_to_leaf_column_exprs_iter(expr).collect::>(); polars_ensure!(leaves.len() <= 1, ComputeError: "found more than one root column name"); match leaves.pop() { @@ -240,7 +242,7 @@ pub(crate) fn aexpr_to_column_nodes_iter<'a>( }) } -pub fn column_node_to_name(node: ColumnNode, arena: &Arena) -> &Arc { +pub fn column_node_to_name(node: ColumnNode, arena: &Arena) -> &PlSmallStr { if let AExpr::Column(name) = arena.get(node.0) { name } else { @@ -254,7 +256,7 @@ pub(crate) fn rename_matching_aexpr_leaf_names( node: Node, arena: &mut Arena, current: &str, - new_name: &str, + new_name: PlSmallStr, ) -> Node { let mut leaves = aexpr_to_column_nodes_iter(node, arena); @@ -262,7 +264,7 @@ pub(crate) fn rename_matching_aexpr_leaf_names( // we 
convert to expression as we cannot easily copy the aexpr. let mut new_expr = node_to_expr(node, arena); new_expr = new_expr.map_expr(|e| match e { - Expr::Column(name) if &*name == current => Expr::Column(ColumnName::from(new_name)), + Expr::Column(name) if &*name == current => Expr::Column(new_name.clone()), e => e, }); to_aexpr(new_expr, arena).expect("infallible") @@ -294,18 +296,18 @@ pub fn expressions_to_schema( pub fn aexpr_to_leaf_names_iter( node: Node, arena: &Arena, -) -> impl Iterator> + '_ { +) -> impl Iterator + '_ { aexpr_to_column_nodes_iter(node, arena).map(|node| match arena.get(node.0) { AExpr::Column(name) => name.clone(), _ => unreachable!(), }) } -pub fn aexpr_to_leaf_names(node: Node, arena: &Arena) -> Vec> { +pub fn aexpr_to_leaf_names(node: Node, arena: &Arena) -> Vec { aexpr_to_leaf_names_iter(node, arena).collect() } -pub fn aexpr_to_leaf_name(node: Node, arena: &Arena) -> Arc { +pub fn aexpr_to_leaf_name(node: Node, arena: &Arena) -> PlSmallStr { aexpr_to_leaf_names_iter(node, arena).next().unwrap() } diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index bb6f87bda11a..118573940820 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -38,7 +38,6 @@ once_cell = { workspace = true } pyo3 = { workspace = true, features = ["abi3-py38", "chrono", "extension-module", "multiple-pymethods"] } recursive = { workspace = true } serde_json = { workspace = true, optional = true } -smartstring = { workspace = true } thiserror = { workspace = true } [dependencies.polars] @@ -118,7 +117,7 @@ parquet = ["polars/parquet", "polars-parquet"] ipc = ["polars/ipc"] ipc_streaming = ["polars/ipc_streaming"] is_in = ["polars/is_in"] -json = ["polars/serde", "serde_json", "polars/json"] +json = ["polars/serde", "serde_json", "polars/json", "polars-utils/serde"] trigonometry = ["polars/trigonometry"] sign = ["polars/sign"] asof_join = ["polars/asof_join"] diff --git 
a/crates/polars-python/src/batched_csv.rs b/crates/polars-python/src/batched_csv.rs index 2f5159bc8402..1a688ba8ba1a 100644 --- a/crates/polars-python/src/batched_csv.rs +++ b/crates/polars-python/src/batched_csv.rs @@ -61,7 +61,7 @@ impl PyBatchedCsv { let null_values = null_values.map(|w| w.0); let eol_char = eol_char.as_bytes()[0]; let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); let quote_char = if let Some(s) = quote_char { @@ -79,7 +79,7 @@ impl PyBatchedCsv { .iter() .map(|(name, dtype)| { let dtype = dtype.0.clone(); - Field::new(name, dtype) + Field::new((&**name).into(), dtype) }) .collect::() }); @@ -102,7 +102,7 @@ impl PyBatchedCsv { .with_projection(projection.map(Arc::new)) .with_rechunk(rechunk) .with_chunk_size(chunk_size) - .with_columns(columns.map(Arc::from)) + .with_columns(columns.map(|x| x.into_iter().map(PlSmallStr::from_string).collect())) .with_n_threads(n_threads) .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) .with_low_memory(low_memory) diff --git a/crates/polars-python/src/conversion/any_value.rs b/crates/polars-python/src/conversion/any_value.rs index 088d2e430f99..3cea8cc91fc0 100644 --- a/crates/polars-python/src/conversion/any_value.rs +++ b/crates/polars-python/src/conversion/any_value.rs @@ -5,7 +5,7 @@ use polars::chunked_array::object::PolarsObjectSafe; #[cfg(feature = "object")] use polars::datatypes::OwnedObject; use polars::datatypes::{DataType, Field, PlHashMap, TimeUnit}; -use polars::prelude::{AnyValue, Series}; +use polars::prelude::{AnyValue, PlSmallStr, Series}; use polars_core::export::chrono::{NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike}; use polars_core::utils::any_values_to_supertype_and_n_dtypes; use polars_core::utils::arrow::temporal_conversions::date32_to_date; @@ -289,7 +289,10 @@ pub(crate) fn py_object_to_any_value<'py>( } if ob.is_empty()? 
{ - Ok(AnyValue::List(Series::new_empty("", &DataType::Null))) + Ok(AnyValue::List(Series::new_empty( + PlSmallStr::const_default(), + &DataType::Null, + ))) } else if ob.is_instance_of::() | ob.is_instance_of::() { const INFER_SCHEMA_LENGTH: usize = 25; @@ -320,7 +323,7 @@ pub(crate) fn py_object_to_any_value<'py>( avs.push(av) } - let s = Series::from_any_values_and_dtype("", &avs, &dtype, strict) + let s = Series::from_any_values_and_dtype(PlSmallStr::const_default(), &avs, &dtype, strict) .map_err(|e| { PyTypeError::new_err(format!( "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types." @@ -348,7 +351,7 @@ pub(crate) fn py_object_to_any_value<'py>( let key = k.extract::>()?; let val = py_object_to_any_value(&v, strict)?; let dtype = val.dtype(); - keys.push(Field::new(&key, dtype)); + keys.push(Field::new(key.as_ref().into(), dtype)); vals.push(val) } Ok(AnyValue::StructOwned(Box::new((vals, keys)))) diff --git a/crates/polars-python/src/conversion/chunked_array.rs b/crates/polars-python/src/conversion/chunked_array.rs index abeb4fa728e8..3a69d61f7dd1 100644 --- a/crates/polars-python/src/conversion/chunked_array.rs +++ b/crates/polars-python/src/conversion/chunked_array.rs @@ -64,7 +64,7 @@ impl ToPyObject for Wrap<&DatetimeChunked> { let utils = UTILS.bind(py); let convert = utils.getattr(intern!(py, "to_py_datetime")).unwrap(); let time_unit = self.0.time_unit().to_ascii(); - let time_zone = time_zone.to_object(py); + let time_zone = time_zone.as_deref().to_object(py); let iter = self .0 .iter() diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index d6283597267a..9906f3ee72f3 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -19,6 +19,7 @@ use polars_core::utils::materialize_dyn_int; use polars_lazy::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::write::StatisticsOptions; +use polars_utils::pl_str::PlSmallStr; 
use polars_utils::total_ord::{TotalEq, TotalHash}; use pyo3::basic::CompareOp; use pyo3::exceptions::{PyTypeError, PyValueError}; @@ -26,7 +27,6 @@ use pyo3::intern; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; -use smartstring::alias::String as SmartString; use crate::error::PyPolarsErr; #[cfg(feature = "object")] @@ -110,15 +110,27 @@ pub(crate) fn to_series(py: Python, s: PySeries) -> PyObject { constructor.call1((s,)).unwrap().into_py(py) } +impl<'a> FromPyObject<'a> for Wrap { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + Ok(Wrap((&*ob.extract::()?).into())) + } +} + #[cfg(feature = "csv")] impl<'a> FromPyObject<'a> for Wrap { fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { - if let Ok(s) = ob.extract::() { - Ok(Wrap(NullValues::AllColumnsSingle(s))) - } else if let Ok(s) = ob.extract::>() { - Ok(Wrap(NullValues::AllColumns(s))) - } else if let Ok(s) = ob.extract::>() { - Ok(Wrap(NullValues::Named(s))) + if let Ok(s) = ob.extract::() { + Ok(Wrap(NullValues::AllColumnsSingle((&*s).into()))) + } else if let Ok(s) = ob.extract::>() { + Ok(Wrap(NullValues::AllColumns( + s.into_iter().map(|x| (&*x).into()).collect(), + ))) + } else if let Ok(s) = ob.extract::>() { + Ok(Wrap(NullValues::Named( + s.into_iter() + .map(|(a, b)| ((&*a).into(), (&*b).into())) + .collect(), + ))) } else { Err( PyPolarsErr::Other("could not extract value from null_values argument".into()) @@ -243,7 +255,7 @@ impl ToPyObject for Wrap { DataType::Datetime(tu, tz) => { let datetime_class = pl.getattr(intern!(py, "Datetime")).unwrap(); datetime_class - .call1((tu.to_ascii(), tz.clone())) + .call1((tu.to_ascii(), tz.as_deref())) .unwrap() .into() }, @@ -267,7 +279,9 @@ impl ToPyObject for Wrap { // we should always have an initialized rev_map coming from rust let categories = rev_map.as_ref().unwrap().get_categories(); let class = pl.getattr(intern!(py, "Enum")).unwrap(); - let s = Series::from_arrow("category", 
categories.to_boxed()).unwrap(); + let s = + Series::from_arrow(PlSmallStr::from_static("category"), categories.to_boxed()) + .unwrap(); let series = to_series(py, s.into()); return class.call1((series,)).unwrap().into(); }, @@ -311,7 +325,7 @@ impl<'py> FromPyObject<'py> for Wrap { let dtype = ob .getattr(intern!(py, "dtype"))? .extract::>()?; - Ok(Wrap(Field::new(&name, dtype.0))) + Ok(Wrap(Field::new((&*name).into(), dtype.0))) } } @@ -393,8 +407,8 @@ impl<'py> FromPyObject<'py> for Wrap { let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); let time_unit = time_unit.extract::>()?.0; let time_zone = ob.getattr(intern!(py, "time_zone")).unwrap(); - let time_zone = time_zone.extract()?; - DataType::Datetime(time_unit, time_zone) + let time_zone = time_zone.extract::>()?; + DataType::Datetime(time_unit, time_zone.as_deref().map(|x| x.into())) }, "Duration" => { let time_unit = ob.getattr(intern!(py, "time_unit")).unwrap(); @@ -507,7 +521,7 @@ impl<'py> FromPyObject<'py> for Wrap { let key = key.extract::()?; let val = val.extract::>()?; - Ok(Field::new(&key, val.0)) + Ok(Field::new((&*key).into(), val.0)) }) .collect::>()?, )) @@ -1173,12 +1187,15 @@ pub(crate) fn parse_parquet_compression( Ok(parsed) } -pub(crate) fn strings_to_smartstrings(container: I) -> Vec +pub(crate) fn strings_to_pl_smallstr(container: I) -> Vec where I: IntoIterator, S: AsRef, { - container.into_iter().map(|s| s.as_ref().into()).collect() + container + .into_iter() + .map(|s| PlSmallStr::from_str(s.as_ref())) + .collect() } #[derive(Debug, Copy, Clone)] diff --git a/crates/polars-python/src/dataframe/construction.rs b/crates/polars-python/src/dataframe/construction.rs index ffe187f2e9ee..2fcdea55ab37 100644 --- a/crates/polars-python/src/dataframe/construction.rs +++ b/crates/polars-python/src/dataframe/construction.rs @@ -133,7 +133,7 @@ where { let fields = column_names .into_iter() - .map(|c| Field::new(c, DataType::Unknown(Default::default()))); + .map(|c| 
Field::new(c.into(), DataType::Unknown(Default::default()))); Schema::from_iter(fields) } diff --git a/crates/polars-python/src/dataframe/export.rs b/crates/polars-python/src/dataframe/export.rs index 6242d1a64496..cfd6feaa0138 100644 --- a/crates/polars-python/src/dataframe/export.rs +++ b/crates/polars-python/src/dataframe/export.rs @@ -71,7 +71,7 @@ impl PyDataFrame { self.df.align_chunks(); Python::with_gil(|py| { let pyarrow = py.import_bound("pyarrow")?; - let names = self.df.get_column_names(); + let names = self.df.get_column_names_str(); let rbs = self .df @@ -92,7 +92,7 @@ impl PyDataFrame { self.df.as_single_chunk_par(); Python::with_gil(|py| { let pyarrow = py.import_bound("pyarrow")?; - let names = self.df.get_column_names(); + let names = self.df.get_column_names_str(); let cat_columns = self .df .get_columns() diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index 7635499b0eb1..043564b20c99 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -18,7 +18,7 @@ use crate::map::dataframe::{ apply_lambda_unknown, apply_lambda_with_bool_out_type, apply_lambda_with_primitive_out_type, apply_lambda_with_string_out_type, }; -use crate::prelude::strings_to_smartstrings; +use crate::prelude::strings_to_pl_smallstr; use crate::series::{PySeries, ToPySeries, ToSeries}; use crate::{PyExpr, PyLazyFrame}; @@ -139,13 +139,13 @@ impl PyDataFrame { /// Get column names pub fn columns(&self) -> Vec<&str> { - self.df.get_column_names() + self.df.get_column_names_str() } /// set column names pub fn set_column_names(&mut self, names: Vec) -> PyResult<()> { self.df - .set_column_names(&names) + .set_column_names(names.iter().map(|x| &**x)) .map_err(PyPolarsErr::from)?; Ok(()) } @@ -246,13 +246,16 @@ impl PyDataFrame { } pub fn select(&self, columns: Vec) -> PyResult { - let df = self.df.select(columns).map_err(PyPolarsErr::from)?; + let df = self + .df + 
.select(columns.iter().map(|x| &**x)) + .map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } pub fn gather(&self, indices: Wrap>) -> PyResult { let indices = indices.0; - let indices = IdxCa::from_vec("", indices); + let indices = IdxCa::from_vec("".into(), indices); let df = self.df.take(&indices).map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } @@ -322,7 +325,7 @@ impl PyDataFrame { pub fn with_row_index(&self, name: &str, offset: Option) -> PyResult { let df = self .df - .with_row_index(name, offset) + .with_row_index(name.into(), offset) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -334,9 +337,9 @@ impl PyDataFrame { maintain_order: bool, ) -> PyResult { let gb = if maintain_order { - self.df.group_by_stable(&by) + self.df.group_by_stable(by.iter().map(|x| &**x)) } else { - self.df.group_by(&by) + self.df.group_by(by.iter().map(|x| &**x)) } .map_err(PyPolarsErr::from)?; @@ -384,8 +387,8 @@ impl PyDataFrame { ) -> PyResult { use polars_ops::pivot::UnpivotDF; let args = UnpivotArgsIR { - on: strings_to_smartstrings(on), - index: strings_to_smartstrings(index), + on: strings_to_pl_smallstr(on), + index: strings_to_pl_smallstr(index), value_name: value_name.map(|s| s.into()), variable_name: variable_name.map(|s| s.into()), }; @@ -581,7 +584,7 @@ impl PyDataFrame { } pub fn to_struct(&self, name: &str, invalid_indices: Vec) -> PySeries { - let ca = self.df.clone().into_struct(name); + let ca = self.df.clone().into_struct(name.into()); if !invalid_indices.is_empty() { let mut validity = MutableBitmap::with_capacity(ca.len()); diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index 10425f724edd..12707e93dd85 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -70,7 +70,7 @@ impl PyDataFrame { let null_values = null_values.map(|w| w.0); let eol_char = eol_char.as_bytes()[0]; let row_index = row_index.map(|(name, offset)| RowIndex { - name: 
Arc::from(name.as_str()), + name: name.into(), offset, }); let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied()); @@ -80,7 +80,7 @@ impl PyDataFrame { .iter() .map(|(name, dtype)| { let dtype = dtype.0.clone(); - Field::new(name, dtype) + Field::new((&**name).into(), dtype) }) .collect::() }); @@ -105,7 +105,7 @@ impl PyDataFrame { .with_projection(projection.map(Arc::new)) .with_rechunk(rechunk) .with_chunk_size(chunk_size) - .with_columns(columns.map(Arc::from)) + .with_columns(columns.map(|x| x.into_iter().map(|x| x.into()).collect())) .with_n_threads(n_threads) .with_schema_overwrite(overwrite_dtype.map(Arc::new)) .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) @@ -153,7 +153,7 @@ impl PyDataFrame { use EitherRustPythonFile::*; let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); let result = match get_either_file(py_f, false)? { @@ -263,7 +263,7 @@ impl PyDataFrame { memory_map: bool, ) -> PyResult { let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); py_f = read_if_bytesio(py_f); @@ -296,7 +296,7 @@ impl PyDataFrame { rechunk: bool, ) -> PyResult { let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); py_f = read_if_bytesio(py_f); diff --git a/crates/polars-python/src/expr/array.rs b/crates/polars-python/src/expr/array.rs index 01e44208e5ff..f94185d8057c 100644 --- a/crates/polars-python/src/expr/array.rs +++ b/crates/polars-python/src/expr/array.rs @@ -1,10 +1,9 @@ -use std::borrow::Cow; - use polars::prelude::*; use polars_ops::prelude::array::ArrToStructNameGenerator; +use polars_utils::pl_str::PlSmallStr; use pyo3::prelude::*; +use pyo3::pybacked::PyBackedStr; use pyo3::pymethods; -use smartstring::alias::String as SmartString; use crate::expr::PyExpr; @@ -114,7 +113,7 @@ impl PyExpr { Arc::new(move |idx: usize| { 
Python::with_gil(|py| { let out = lambda.call1(py, (idx,)).unwrap(); - let out: SmartString = out.extract::>(py).unwrap().into(); + let out: PlSmallStr = (&*out.extract::(py).unwrap()).into(); out }) }) as ArrToStructNameGenerator diff --git a/crates/polars-python/src/expr/datetime.rs b/crates/polars-python/src/expr/datetime.rs index 5065ba676cad..69325b03a19f 100644 --- a/crates/polars-python/src/expr/datetime.rs +++ b/crates/polars-python/src/expr/datetime.rs @@ -46,8 +46,12 @@ impl PyExpr { } #[cfg(feature = "timezones")] - fn dt_convert_time_zone(&self, time_zone: TimeZone) -> Self { - self.inner.clone().dt().convert_time_zone(time_zone).into() + fn dt_convert_time_zone(&self, time_zone: String) -> Self { + self.inner + .clone() + .dt() + .convert_time_zone(time_zone.into()) + .into() } fn dt_cast_time_unit(&self, time_unit: Wrap) -> Self { @@ -65,7 +69,7 @@ impl PyExpr { self.inner .clone() .dt() - .replace_time_zone(time_zone, ambiguous.inner, non_existent.0) + .replace_time_zone(time_zone.map(|x| x.into()), ambiguous.inner, non_existent.0) .into() } diff --git a/crates/polars-python/src/expr/general.rs b/crates/polars-python/src/expr/general.rs index cfcfb438fda7..c3490473e63b 100644 --- a/crates/polars-python/src/expr/general.rs +++ b/crates/polars-python/src/expr/general.rs @@ -228,7 +228,7 @@ impl PyExpr { fn value_counts(&self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self { self.inner .clone() - .value_counts(sort, parallel, name, normalize) + .value_counts(sort, parallel, name.as_str(), normalize) .into() } fn unique_counts(&self) -> Self { diff --git a/crates/polars-python/src/expr/list.rs b/crates/polars-python/src/expr/list.rs index 9ab917918b83..cb179eb0e859 100644 --- a/crates/polars-python/src/expr/list.rs +++ b/crates/polars-python/src/expr/list.rs @@ -2,8 +2,8 @@ use std::borrow::Cow; use polars::prelude::*; use polars::series::ops::NullBehavior; +use polars_utils::pl_str::PlSmallStr; use pyo3::prelude::*; -use 
smartstring::alias::String as SmartString; use crate::conversion::Wrap; use crate::PyExpr; @@ -214,7 +214,7 @@ impl PyExpr { Arc::new(move |idx: usize| { Python::with_gil(|py| { let out = lambda.call1(py, (idx,)).unwrap(); - let out: SmartString = out.extract::>(py).unwrap().into(); + let out: PlSmallStr = out.extract::>(py).unwrap().as_ref().into(); out }) }) as NameGenerator diff --git a/crates/polars-python/src/expr/name.rs b/crates/polars-python/src/expr/name.rs index 6bbda4a6668a..e5be57ac9458 100644 --- a/crates/polars-python/src/expr/name.rs +++ b/crates/polars-python/src/expr/name.rs @@ -1,8 +1,9 @@ use std::borrow::Cow; use polars::prelude::*; +use polars_utils::format_pl_smallstr; +use polars_utils::pl_str::PlSmallStr; use pyo3::prelude::*; -use smartstring::alias::String as SmartString; use crate::PyExpr; @@ -17,9 +18,9 @@ impl PyExpr { .clone() .name() .map(move |name| { - let out = Python::with_gil(|py| lambda.call1(py, (name,))); + let out = Python::with_gil(|py| lambda.call1(py, (name.as_str(),))); match out { - Ok(out) => Ok(out.to_string()), + Ok(out) => Ok(format_pl_smallstr!("{}", out)), Err(e) => Err(PolarsError::ComputeError( format!("Python function in 'name.map' produced an error: {e}.").into(), )), @@ -48,7 +49,7 @@ impl PyExpr { let name_mapper = Arc::new(move |name: &str| { Python::with_gil(|py| { let out = name_mapper.call1(py, (name,)).unwrap(); - let out: SmartString = out.extract::>(py).unwrap().into(); + let out: PlSmallStr = out.extract::>(py).unwrap().as_ref().into(); out }) }) as FieldsNameMapper; diff --git a/crates/polars-python/src/expr/rolling.rs b/crates/polars-python/src/expr/rolling.rs index b854cb4bd89b..81f131e44060 100644 --- a/crates/polars-python/src/expr/rolling.rs +++ b/crates/polars-python/src/expr/rolling.rs @@ -363,81 +363,131 @@ impl PyExpr { UInt8 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(UInt8Chunked::from_slice("", &[v as u8]).into_series()) + Ok(UInt8Chunked::from_slice( + 
PlSmallStr::const_default(), + &[v as u8], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| UInt8Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + UInt8Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, UInt16 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(UInt16Chunked::from_slice("", &[v as u16]).into_series()) + Ok(UInt16Chunked::from_slice( + PlSmallStr::const_default(), + &[v as u16], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| UInt16Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + UInt16Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, UInt32 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(UInt32Chunked::from_slice("", &[v as u32]).into_series()) + Ok(UInt32Chunked::from_slice( + PlSmallStr::const_default(), + &[v as u32], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| UInt32Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + UInt32Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, UInt64 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(UInt64Chunked::from_slice("", &[v as u64]).into_series()) + Ok(UInt64Chunked::from_slice( + PlSmallStr::const_default(), + &[v as u64], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| UInt64Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + UInt64Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, Int8 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(Int8Chunked::from_slice("", &[v as i8]).into_series()) + Ok(Int8Chunked::from_slice( + PlSmallStr::const_default(), + &[v as i8], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| Int8Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + Int8Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + 
}) } }, Int16 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(Int16Chunked::from_slice("", &[v as i16]).into_series()) + Ok(Int16Chunked::from_slice( + PlSmallStr::const_default(), + &[v as i16], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| Int16Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + Int16Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, Int32 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(Int32Chunked::from_slice("", &[v as i32]).into_series()) + Ok(Int32Chunked::from_slice( + PlSmallStr::const_default(), + &[v as i32], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| Int32Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + Int32Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, Int64 => { if is_float { let v = obj.extract::(py).unwrap(); - Ok(Int64Chunked::from_slice("", &[v as i64]).into_series()) + Ok(Int64Chunked::from_slice( + PlSmallStr::const_default(), + &[v as i64], + ) + .into_series()) } else { - obj.extract::(py) - .map(|v| Int64Chunked::from_slice("", &[v]).into_series()) + obj.extract::(py).map(|v| { + Int64Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }) } }, - Float32 => obj - .extract::(py) - .map(|v| Float32Chunked::from_slice("", &[v]).into_series()), - Float64 => obj - .extract::(py) - .map(|v| Float64Chunked::from_slice("", &[v]).into_series()), + Float32 => obj.extract::(py).map(|v| { + Float32Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }), + Float64 => obj.extract::(py).map(|v| { + Float64Chunked::from_slice(PlSmallStr::const_default(), &[v]) + .into_series() + }), dt => panic!("{dt:?} not implemented"), }; diff --git a/crates/polars-python/src/expr/string.rs b/crates/polars-python/src/expr/string.rs index 55f2aa71140b..e238e412dc02 100644 --- a/crates/polars-python/src/expr/string.rs +++ 
b/crates/polars-python/src/expr/string.rs @@ -17,6 +17,8 @@ impl PyExpr { #[pyo3(signature = (format, strict, exact, cache))] fn str_to_date(&self, format: Option, strict: bool, exact: bool, cache: bool) -> Self { + let format = format.map(|x| x.into()); + let options = StrptimeOptions { format, strict, @@ -31,12 +33,15 @@ impl PyExpr { &self, format: Option, time_unit: Option>, - time_zone: Option, + time_zone: Option>, strict: bool, exact: bool, cache: bool, ambiguous: Self, ) -> Self { + let format = format.map(|x| x.into()); + let time_zone = time_zone.map(|x| x.0); + let options = StrptimeOptions { format, strict, @@ -57,6 +62,8 @@ impl PyExpr { #[pyo3(signature = (format, strict, cache))] fn str_to_time(&self, format: Option, strict: bool, cache: bool) -> Self { + let format = format.map(|x| x.into()); + let options = StrptimeOptions { format, strict, diff --git a/crates/polars-python/src/functions/io.rs b/crates/polars-python/src/functions/io.rs index f6da57e5fc3d..3cd75ea68aba 100644 --- a/crates/polars-python/src/functions/io.rs +++ b/crates/polars-python/src/functions/io.rs @@ -56,7 +56,7 @@ fn fields_to_pydict(fields: &Vec, dict: &Bound<'_, PyDict>, py: Python) - } else { Wrap((&field.data_type).into()) }; - dict.set_item(&field.name, dt.to_object(py))?; + dict.set_item(field.name.as_str(), dt.to_object(py))?; } Ok(()) } diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index 51800ed9d4e1..c0f9d0f7152a 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -97,13 +97,7 @@ pub fn as_struct(exprs: Vec) -> PyResult { #[pyfunction] pub fn field(names: Vec) -> PyExpr { - dsl::Expr::Field( - names - .into_iter() - .map(|name| Arc::from(name.as_str())) - .collect(), - ) - .into() + dsl::Expr::Field(names.into_iter().map(|x| x.into()).collect()).into() } #[pyfunction] @@ -254,7 +248,7 @@ pub fn datetime( second: Option, microsecond: Option, time_unit: Wrap, - 
time_zone: Option, + time_zone: Option>, ambiguous: Option, ) -> PyExpr { let year = year.inner; @@ -265,6 +259,7 @@ pub fn datetime( .map(|e| e.inner) .unwrap_or(dsl::lit(String::from("raise"))); let time_unit = time_unit.0; + let time_zone = time_zone.map(|x| x.0); let args = DatetimeArgs { year, month, diff --git a/crates/polars-python/src/functions/misc.rs b/crates/polars-python/src/functions/misc.rs index 114d93276e9b..2ade770d728e 100644 --- a/crates/polars-python/src/functions/misc.rs +++ b/crates/polars-python/src/functions/misc.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use polars_plan::prelude::*; use pyo3::prelude::*; @@ -52,9 +50,9 @@ pub fn register_plugin_function( Ok(Expr::Function { input: args.to_exprs(), function: FunctionExpr::FfiPlugin { - lib: Arc::from(plugin_path), - symbol: Arc::from(function_name), - kwargs: Arc::from(kwargs), + lib: plugin_path.into(), + symbol: function_name.into(), + kwargs: kwargs.into(), }, options: FunctionOptions { collect_groups, diff --git a/crates/polars-python/src/functions/range.rs b/crates/polars-python/src/functions/range.rs index ce725dda4ca4..b07522650de3 100644 --- a/crates/polars-python/src/functions/range.rs +++ b/crates/polars-python/src/functions/range.rs @@ -34,7 +34,7 @@ pub fn eager_int_range( let start_v: <$T as PolarsNumericType>::Native = lower.extract()?; let end_v: <$T as PolarsNumericType>::Native = upper.extract()?; let step: i64 = step.extract()?; - new_int_range::<$T>(start_v, end_v, step, "literal") + new_int_range::<$T>(start_v, end_v, step, PlSmallStr::from_static("literal")) }); let s = ret.map_err(PyPolarsErr::from)?; @@ -100,13 +100,14 @@ pub fn datetime_range( every: &str, closed: Wrap, time_unit: Option>, - time_zone: Option, + time_zone: Option>, ) -> PyExpr { let start = start.inner; let end = end.inner; let every = Duration::parse(every); let closed = closed.0; let time_unit = time_unit.map(|x| x.0); + let time_zone = time_zone.map(|x| x.0); dsl::datetime_range(start, end, every, 
closed, time_unit, time_zone).into() } @@ -117,13 +118,14 @@ pub fn datetime_ranges( every: &str, closed: Wrap, time_unit: Option>, - time_zone: Option, + time_zone: Option>, ) -> PyExpr { let start = start.inner; let end = end.inner; let every = Duration::parse(every); let closed = closed.0; let time_unit = time_unit.map(|x| x.0); + let time_zone = time_zone.map(|x| x.0); dsl::datetime_ranges(start, end, every, closed, time_unit, time_zone).into() } diff --git a/crates/polars-python/src/interop/arrow/to_py.rs b/crates/polars-python/src/interop/arrow/to_py.rs index 2581a52f34ce..f42d2541f94e 100644 --- a/crates/polars-python/src/interop/arrow/to_py.rs +++ b/crates/polars-python/src/interop/arrow/to_py.rs @@ -5,7 +5,7 @@ use arrow::ffi; use arrow::record_batch::RecordBatch; use polars::datatypes::CompatLevel; use polars::frame::DataFrame; -use polars::prelude::{ArrayRef, ArrowField}; +use polars::prelude::{ArrayRef, ArrowField, PlSmallStr}; use polars::series::Series; use polars_core::utils::arrow; use polars_error::PolarsResult; @@ -20,7 +20,7 @@ pub(crate) fn to_py_array( pyarrow: &Bound, ) -> PyResult { let schema = Box::new(ffi::export_field_to_c(&ArrowField::new( - "", + PlSmallStr::const_default(), array.data_type().clone(), true, ))); @@ -103,7 +103,7 @@ impl DataFrameStreamIterator { } fn field(&self) -> ArrowField { - ArrowField::new("", self.data_type.clone(), false) + ArrowField::new(PlSmallStr::const_default(), self.data_type.clone(), false) } } diff --git a/crates/polars-python/src/interop/arrow/to_rust.rs b/crates/polars-python/src/interop/arrow/to_rust.rs index 411a683ad778..432605658cbc 100644 --- a/crates/polars-python/src/interop/arrow/to_rust.rs +++ b/crates/polars-python/src/interop/arrow/to_rust.rs @@ -51,7 +51,12 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { .first() .ok_or_else(|| PyPolarsErr::Other("empty table".into()))? .getattr("schema")?; - let names = schema.getattr("names")?.extract::>()?; + let names = schema + .getattr("names")? 
+ .extract::>()? + .into_iter() + .map(PlSmallStr::from_string) + .collect::>(); let dfs = rb .iter() @@ -79,7 +84,7 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { .into_par_iter() .enumerate() .map(|(i, arr)| { - let s = Series::try_from((names[i].as_str(), arr)) + let s = Series::try_from((names[i].clone(), arr)) .map_err(PyPolarsErr::from)?; Ok(s) }) @@ -90,8 +95,8 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { .into_iter() .enumerate() .map(|(i, arr)| { - let s = Series::try_from((names[i].as_str(), arr)) - .map_err(PyPolarsErr::from)?; + let s = + Series::try_from((names[i].clone(), arr)).map_err(PyPolarsErr::from)?; Ok(s) }) .collect::>>() diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 18ecbf13333b..dff78ddca623 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -48,7 +48,7 @@ impl PyLazyFrame { file_cache_ttl: Option, ) -> PyResult { let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); @@ -95,7 +95,7 @@ impl PyLazyFrame { .with_schema_overwrite(schema_overrides.map(|x| Arc::new(x.0))) .with_row_index(row_index) .with_ignore_errors(ignore_errors) - .with_include_file_paths(include_file_paths.map(Arc::from)) + .with_include_file_paths(include_file_paths.map(|x| x.into())) .with_cloud_options(cloud_options) .finish() .map_err(PyPolarsErr::from)?; @@ -150,14 +150,14 @@ impl PyLazyFrame { let separator = separator.as_bytes()[0]; let eol_char = eol_char.as_bytes()[0]; let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { overwrite_dtype .into_iter() - .map(|(name, dtype)| Field::new(&name, dtype.0)) + .map(|(name, dtype)| Field::new((&*name).into(), dtype.0)) .collect::() }); @@ -205,7 +205,7 @@ impl PyLazyFrame { 
.with_dtype_overwrite(overwrite_dtype.map(Arc::new)) .with_schema(schema.map(|schema| Arc::new(schema.0))) .with_low_memory(low_memory) - .with_comment_prefix(comment_prefix) + .with_comment_prefix(comment_prefix.map(|x| x.into())) .with_quote_char(quote_char) .with_eol_char(eol_char) .with_rechunk(rechunk) @@ -220,7 +220,7 @@ impl PyLazyFrame { .with_glob(glob) .with_raise_if_empty(raise_if_empty) .with_cloud_options(cloud_options) - .with_include_file_paths(include_file_paths.map(Arc::from)); + .with_include_file_paths(include_file_paths.map(|x| x.into())); if let Some(lambda) = with_schema_modify { let f = |schema: Schema| { @@ -238,7 +238,7 @@ impl PyLazyFrame { Ok(schema .iter_dtypes() .zip(new_names) - .map(|(dtype, name)| Field::from_owned(name.into(), dtype.clone())) + .map(|(dtype, name)| Field::new(name.into(), dtype.clone())) .collect()) }) }; @@ -298,7 +298,7 @@ impl PyLazyFrame { }; let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); let hive_options = HiveOptions { @@ -319,7 +319,7 @@ impl PyLazyFrame { use_statistics, hive_options, glob, - include_file_paths: include_file_paths.map(Arc::from), + include_file_paths: include_file_paths.map(|x| x.into()), }; let lf = if path.is_some() { @@ -351,7 +351,7 @@ impl PyLazyFrame { include_file_paths: Option, ) -> PyResult { let row_index = row_index.map(|(name, offset)| RowIndex { - name: Arc::from(name.as_str()), + name: name.into(), offset, }); @@ -398,7 +398,7 @@ impl PyLazyFrame { #[cfg(feature = "cloud")] cloud_options, hive_options, - include_file_paths: include_file_paths.map(Arc::from), + include_file_paths: include_file_paths.map(|x| x.into()), }; let lf = if let Some(path) = &path { @@ -426,8 +426,11 @@ impl PyLazyFrame { scan_fn: PyObject, pyarrow: bool, ) -> PyResult { - let schema = - Schema::from_iter(schema.into_iter().map(|(name, dt)| Field::new(&name, dt.0))); + let schema = Schema::from_iter( + schema + .into_iter() + 
.map(|(name, dt)| Field::new((&*name).into(), dt.0)), + ); Ok(LazyFrame::scan_from_python_function(schema, scan_fn, pyarrow).into()) } @@ -911,8 +914,8 @@ impl PyLazyFrame { .coalesce(coalesce) .how(JoinType::AsOf(AsOfOptions { strategy: strategy.0, - left_by: left_by.map(strings_to_smartstrings), - right_by: right_by.map(strings_to_smartstrings), + left_by: left_by.map(strings_to_pl_smallstr), + right_by: right_by.map(strings_to_pl_smallstr), tolerance: tolerance.map(|t| t.0.into_static().unwrap()), tolerance_str: tolerance_str.map(|s| s.into()), })) diff --git a/crates/polars-python/src/lazyframe/visit.rs b/crates/polars-python/src/lazyframe/visit.rs index 36d8e6e4b793..05e35db56f39 100644 --- a/crates/polars-python/src/lazyframe/visit.rs +++ b/crates/polars-python/src/lazyframe/visit.rs @@ -26,7 +26,7 @@ impl From for PyExprIR { fn from(value: ExprIR) -> Self { Self { node: value.node().0, - output_name: value.output_name().into(), + output_name: value.output_name().to_string(), } } } @@ -35,7 +35,7 @@ impl From<&ExprIR> for PyExprIR { fn from(value: &ExprIR) -> Self { Self { node: value.node().0, - output_name: value.output_name().into(), + output_name: value.output_name().to_string(), } } } diff --git a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs index d282e6d528e3..4a6878bc35e5 100644 --- a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs @@ -763,7 +763,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { ignore_nulls, } => ( PyStringFunction::ConcatHorizontal.into_py(py), - delimiter, + delimiter.as_str(), ignore_nulls, ) .to_object(py), @@ -772,7 +772,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { ignore_nulls, } => ( PyStringFunction::ConcatVertical.into_py(py), - delimiter, + delimiter.as_str(), ignore_nulls, ) .to_object(py), @@ -796,7 +796,7 @@ pub(crate) fn into_py(py: 
Python<'_>, expr: &AExpr) -> PyResult { StringFunction::ExtractGroups { dtype, pat } => ( PyStringFunction::ExtractGroups.into_py(py), Wrap(dtype.clone()).to_object(py), - pat, + pat.as_str(), ) .to_object(py), #[cfg(feature = "regex")] @@ -979,7 +979,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { }, #[cfg(feature = "timezones")] TemporalFunction::ConvertTimeZone(time_zone) => { - (PyTemporalFunction::ConvertTimeZone, time_zone).into_py(py) + (PyTemporalFunction::ConvertTimeZone, time_zone.as_str()).into_py(py) }, TemporalFunction::TimeStamp(time_unit) => { (PyTemporalFunction::TimeStamp, Wrap(*time_unit)).into_py(py) @@ -1193,7 +1193,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { parallel, name, normalize, - } => ("value_counts", sort, parallel, name, normalize).to_object(py), + } => ("value_counts", sort, parallel, name.as_str(), normalize).to_object(py), FunctionExpr::UniqueCounts => ("unique_counts",).to_object(py), FunctionExpr::ApproxNUnique => ("approx_n_unique",).to_object(py), FunctionExpr::Coalesce => ("coalesce",).to_object(py), diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 85afcdbf7bb3..7a61f2b2bad3 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -55,11 +55,15 @@ impl PyFileOptions { } #[getter] fn with_columns(&self, py: Python<'_>) -> PyResult { - Ok(self - .inner - .with_columns - .as_ref() - .map_or_else(|| py.None(), |cols| cols.to_object(py))) + Ok(self.inner.with_columns.as_ref().map_or_else( + || py.None(), + |cols| { + cols.iter() + .map(|x| x.as_str()) + .collect::>() + .to_object(py) + }, + )) } #[getter] fn cache(&self, _py: Python<'_>) -> PyResult { @@ -270,10 +274,15 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { .scan_fn .as_ref() .map_or_else(|| py.None(), |s| s.0.clone()), - options - .with_columns - .as_ref() - 
.map_or_else(|| py.None(), |cols| cols.to_object(py)), + options.with_columns.as_ref().map_or_else( + || py.None(), + |cols| { + cols.iter() + .map(|x| x.as_str()) + .collect::>() + .to_object(py) + }, + ), python_src, match &options.predicate { PythonPredicate::None => py.None(), @@ -472,7 +481,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { }, options.args.join_nulls, options.args.slice, - options.args.suffix.clone(), + options.args.suffix.as_deref(), options.args.coalesce.coalesce(&options.args.how), ) .to_object(py), diff --git a/crates/polars-python/src/map/dataframe.rs b/crates/polars-python/src/map/dataframe.rs index d50adb7404e1..c91353dfff8d 100644 --- a/crates/polars-python/src/map/dataframe.rs +++ b/crates/polars-python/src/map/dataframe.rs @@ -168,10 +168,16 @@ where { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - ChunkedArray::full_null("map", df.height()) + ChunkedArray::full_null(PlSmallStr::from_static("map"), df.height()) } else { let iter = apply_iter(df, py, lambda, init_null_count, skip); - iterator_to_primitive(iter, init_null_count, first_value, "map", df.height()) + iterator_to_primitive( + iter, + init_null_count, + first_value, + PlSmallStr::from_static("map"), + df.height(), + ) } } @@ -185,10 +191,16 @@ pub fn apply_lambda_with_bool_out_type<'a>( ) -> ChunkedArray { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - ChunkedArray::full_null("map", df.height()) + ChunkedArray::full_null(PlSmallStr::from_static("map"), df.height()) } else { let iter = apply_iter(df, py, lambda, init_null_count, skip); - iterator_to_bool(iter, init_null_count, first_value, "map", df.height()) + iterator_to_bool( + iter, + init_null_count, + first_value, + PlSmallStr::from_static("map"), + df.height(), + ) } } @@ -202,10 +214,16 @@ pub fn apply_lambda_with_string_out_type<'a>( ) -> StringChunked { let skip = usize::from(first_value.is_some()); if init_null_count == 
df.height() { - ChunkedArray::full_null("map", df.height()) + ChunkedArray::full_null(PlSmallStr::from_static("map"), df.height()) } else { let iter = apply_iter::(df, py, lambda, init_null_count, skip); - iterator_to_string(iter, init_null_count, first_value, "map", df.height()) + iterator_to_string( + iter, + init_null_count, + first_value, + PlSmallStr::from_static("map"), + df.height(), + ) } } @@ -220,7 +238,10 @@ pub fn apply_lambda_with_list_out_type<'a>( ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - Ok(ChunkedArray::full_null("map", df.height())) + Ok(ChunkedArray::full_null( + PlSmallStr::from_static("map"), + df.height(), + )) } else { let mut iters = get_iters_skip(df, init_null_count + skip); let iter = ((init_null_count + skip)..df.height()).map(|_| { @@ -240,7 +261,14 @@ pub fn apply_lambda_with_list_out_type<'a>( Err(e) => panic!("python function failed {e}"), } }); - iterator_to_list(dt, iter, init_null_count, first_value, "map", df.height()) + iterator_to_list( + dt, + iter, + init_null_count, + first_value, + PlSmallStr::from_static("map"), + df.height(), + ) } } diff --git a/crates/polars-python/src/map/lazy.rs b/crates/polars-python/src/map/lazy.rs index 759f1d25f443..f7edcbe3facb 100644 --- a/crates/polars-python/src/map/lazy.rs +++ b/crates/polars-python/src/map/lazy.rs @@ -194,7 +194,7 @@ pub fn map_mul( let output_map = GetOutput::map_field(move |fld| { Ok(match output_type { - Some(ref dt) => Field::new(fld.name(), dt.0.clone()), + Some(ref dt) => Field::new(fld.name().clone(), dt.0.clone()), None => fld.clone(), }) }); diff --git a/crates/polars-python/src/map/mod.rs b/crates/polars-python/src/map/mod.rs index db21681a04a8..b7422a0f9f87 100644 --- a/crates/polars-python/src/map/mod.rs +++ b/crates/polars-python/src/map/mod.rs @@ -9,10 +9,10 @@ use polars::prelude::*; use polars_core::export::rayon::prelude::*; use polars_core::utils::CustomIterTools; use polars_core::POOL; +use 
polars_utils::pl_str::PlSmallStr; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::PyDict; -use smartstring::alias::String as SmartString; use crate::error::PyPolarsErr; use crate::prelude::ObjectValue; @@ -35,7 +35,7 @@ fn iterator_to_struct<'a>( it: impl Iterator>>, init_null_count: usize, first_value: AnyValue<'a>, - name: &str, + name: PlSmallStr, capacity: usize, ) -> PyResult { let (vals, flds) = match &first_value { @@ -54,11 +54,11 @@ fn iterator_to_struct<'a>( // [ a values ] // [ b values ] // ] - let mut struct_fields: BTreeMap> = BTreeMap::new(); + let mut struct_fields: BTreeMap> = BTreeMap::new(); // As a BTreeMap sorts its keys, we also need to track the original // order of the field names. - let mut field_names_ordered: Vec = Vec::with_capacity(flds.len()); + let mut field_names_ordered: Vec = Vec::with_capacity(flds.len()); // Use the first value and the known null count to initialize the buffers // if we find a new key later on, we make a new entry in the BTree. 
@@ -96,7 +96,7 @@ fn iterator_to_struct<'a>( let mut buf = Vec::with_capacity(capacity); buf.extend((0..init_null_count + current_len).map(|_| AnyValue::Null)); buf.push(item.0); - let key: SmartString = (&*key).into(); + let key: PlSmallStr = (&*key).into(); field_names_ordered.push(key.clone()); struct_fields.insert(key, buf); }; @@ -118,7 +118,7 @@ fn iterator_to_struct<'a>( let fields = POOL.install(|| { field_names_ordered .par_iter() - .map(|name| Series::new(name, struct_fields.get(name).unwrap())) + .map(|name| Series::new(name.clone(), struct_fields.get(name).unwrap())) .collect::>() }); @@ -132,7 +132,7 @@ fn iterator_to_primitive( it: impl Iterator>, init_null_count: usize, first_value: Option, - name: &str, + name: PlSmallStr, capacity: usize, ) -> ChunkedArray where @@ -164,7 +164,7 @@ fn iterator_to_bool( it: impl Iterator>, init_null_count: usize, first_value: Option, - name: &str, + name: PlSmallStr, capacity: usize, ) -> ChunkedArray { // SAFETY: we know the iterators len. @@ -194,7 +194,7 @@ fn iterator_to_object( it: impl Iterator>, init_null_count: usize, first_value: Option, - name: &str, + name: PlSmallStr, capacity: usize, ) -> ObjectChunked { // SAFETY: we know the iterators len. @@ -223,7 +223,7 @@ fn iterator_to_string>( it: impl Iterator>, init_null_count: usize, first_value: Option, - name: &str, + name: PlSmallStr, capacity: usize, ) -> StringChunked { // SAFETY: we know the iterators len. @@ -252,7 +252,7 @@ fn iterator_to_list( it: impl Iterator>, init_null_count: usize, first_value: Option<&Series>, - name: &str, + name: PlSmallStr, capacity: usize, ) -> PyResult { let mut builder = @@ -269,7 +269,7 @@ fn iterator_to_list( Some(s) => { if s.len() == 0 && s.dtype() != dt { builder - .append_series(&Series::full_null("", 0, dt)) + .append_series(&Series::full_null(PlSmallStr::const_default(), 0, dt)) .unwrap() } else { builder.append_series(&s).map_err(PyPolarsErr::from)? 
diff --git a/crates/polars-python/src/map/series.rs b/crates/polars-python/src/map/series.rs index 9ec530002429..3afebc16f046 100644 --- a/crates/polars-python/src/map/series.rs +++ b/crates/polars-python/src/map/series.rs @@ -248,7 +248,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -266,13 +266,25 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { .into_no_null_iter() .skip(init_null_count + skip) .map(|val| call_lambda(py, lambda, val).ok()); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } else { let it = self .into_iter() .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } } @@ -289,7 +301,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -299,7 +311,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -313,7 +325,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -328,7 +340,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + 
Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -338,7 +350,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -352,7 +364,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -367,7 +379,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -380,7 +392,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -394,7 +406,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -411,7 +423,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { let skip = 1; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -423,7 +435,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -438,7 +450,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -475,7 +487,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { }); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -488,7 +500,7 @@ impl<'a> 
ApplyLambda<'a> for BooleanChunked { ) -> PyResult> { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -499,7 +511,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -513,7 +525,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -541,7 +553,7 @@ where null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -559,13 +571,25 @@ where .into_no_null_iter() .skip(init_null_count + skip) .map(|val| call_lambda(py, lambda, val).ok()); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } else { let it = self .into_iter() .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } } @@ -582,7 +606,7 @@ where { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -592,7 +616,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -606,7 +630,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -621,7 +645,7 @@ 
where ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -631,7 +655,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -645,7 +669,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -660,7 +684,7 @@ where ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -671,7 +695,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -685,7 +709,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -702,7 +726,7 @@ where let skip = 1; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -714,7 +738,7 @@ where it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -729,7 +753,7 @@ where it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -766,7 +790,7 @@ where }); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -779,7 +803,7 @@ where ) -> PyResult> { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), 
self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -790,7 +814,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -804,7 +828,7 @@ where it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -827,7 +851,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -845,13 +869,25 @@ impl<'a> ApplyLambda<'a> for StringChunked { .into_no_null_iter() .skip(init_null_count + skip) .map(|val| call_lambda(py, lambda, val).ok()); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } else { let it = self .into_iter() .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } } @@ -868,7 +904,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -878,7 +914,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -892,7 +928,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -907,7 +943,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == 
self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -917,7 +953,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -931,7 +967,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -946,7 +982,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -957,7 +993,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -971,7 +1007,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -987,7 +1023,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { let skip = 1; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -999,7 +1035,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -1014,7 +1050,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -1051,7 +1087,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { }); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), 
&avs)) } #[cfg(feature = "object")] @@ -1064,7 +1100,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { ) -> PyResult> { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1075,7 +1111,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1089,7 +1125,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1150,7 +1186,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -1180,7 +1216,13 @@ impl<'a> ApplyLambda<'a> for ListChunked { .unwrap(); call_lambda(py, lambda, python_series_wrapper).ok() }); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } else { let it = self .into_iter() @@ -1198,7 +1240,13 @@ impl<'a> ApplyLambda<'a> for ListChunked { call_lambda(py, lambda, python_series_wrapper).ok() }) }); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } } @@ -1216,7 +1264,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1236,7 +1284,7 @@ impl<'a> 
ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1260,7 +1308,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1276,7 +1324,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1296,7 +1344,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1320,7 +1368,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1338,7 +1386,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1359,7 +1407,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1383,7 +1431,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1400,7 +1448,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { let pypolars = PyModule::import_bound(py, "polars")?; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self 
.into_no_null_iter() @@ -1412,7 +1460,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -1425,7 +1473,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -1473,7 +1521,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { .map(call_with_value); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -1487,7 +1535,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1508,7 +1556,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1532,7 +1580,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1565,7 +1613,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -1595,7 +1643,13 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { .unwrap(); call_lambda(py, lambda, python_series_wrapper).ok() }); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } else { let it = self .into_iter() @@ -1613,7 +1667,13 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { call_lambda(py, lambda, python_series_wrapper).ok() }) }); - iterator_to_struct(it, init_null_count, 
first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } } @@ -1631,7 +1691,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1651,7 +1711,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1675,7 +1735,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1691,7 +1751,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1711,7 +1771,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1735,7 +1795,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1753,7 +1813,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1774,7 +1834,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - 
self.name(), + self.name().clone(), self.len(), )) } else { @@ -1798,7 +1858,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1815,7 +1875,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { let pypolars = PyModule::import_bound(py, "polars")?; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1827,7 +1887,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -1840,7 +1900,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -1888,7 +1948,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { .map(call_with_value); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -1902,7 +1962,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { let skip = usize::from(first_value.is_some()); let pypolars = PyModule::import_bound(py, "polars")?; if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -1923,7 +1983,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -1947,7 +2007,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -1971,7 +2031,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { null_count += 1 } } - Ok(Self::full_null(self.name(), self.len()) + 
Ok(Self::full_null(self.name().clone(), self.len()) .into_series() .into()) } @@ -1991,7 +2051,13 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { let out = lambda.call1((object_value.map(|v| &v.inner),)).unwrap(); Some(out) }); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } fn apply_lambda_with_primitive_out_type( @@ -2007,7 +2073,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -2017,7 +2083,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -2031,7 +2097,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2046,7 +2112,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -2056,7 +2122,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -2070,7 +2136,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2085,7 +2151,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + 
Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -2096,7 +2162,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -2110,7 +2176,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2127,7 +2193,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { let skip = 1; let lambda = lambda.bind(py); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -2139,7 +2205,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } else { @@ -2154,7 +2220,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -2191,7 +2257,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { }); avs.extend(iter); } - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -2204,7 +2270,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { ) -> PyResult> { let skip = usize::from(first_value.is_some()); if init_null_count == self.len() { - Ok(ChunkedArray::full_null(self.name(), self.len())) + Ok(ChunkedArray::full_null(self.name().clone(), self.len())) } else if !self.has_nulls() { let it = self .into_no_null_iter() @@ -2215,7 +2281,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } else { @@ -2229,7 +2295,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2269,7 +2335,13 @@ 
impl<'a> ApplyLambda<'a> for StructChunked { let out = lambda.call1((Wrap(val),)).unwrap(); Some(out) }); - iterator_to_struct(it, init_null_count, first_value, self.name(), self.len()) + iterator_to_struct( + it, + init_null_count, + first_value, + self.name().clone(), + self.len(), + ) } fn apply_lambda_with_primitive_out_type( @@ -2292,7 +2364,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2313,7 +2385,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2334,7 +2406,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } @@ -2356,7 +2428,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { it, init_null_count, Some(first_value), - self.name(), + self.name().clone(), self.len(), ) } @@ -2379,7 +2451,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { }); avs.extend(iter); - Ok(Series::new(self.name(), &avs)) + Ok(Series::new(self.name().clone(), &avs)) } #[cfg(feature = "object")] @@ -2399,7 +2471,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { it, init_null_count, first_value, - self.name(), + self.name().clone(), self.len(), )) } diff --git a/crates/polars-python/src/on_startup.rs b/crates/polars-python/src/on_startup.rs index 75b0802d9166..3f08f71740b5 100644 --- a/crates/polars-python/src/on_startup.rs +++ b/crates/polars-python/src/on_startup.rs @@ -76,7 +76,7 @@ pub fn register_startup_deps() { } // register object type builder - let object_builder = Box::new(|name: &str, capacity: usize| { + let object_builder = Box::new(|name: PlSmallStr, capacity: usize| { Box::new(ObjectChunkedBuilder::::new(name, capacity)) as Box }); diff --git a/crates/polars-python/src/series/buffers.rs b/crates/polars-python/src/series/buffers.rs index 680cf36b6b75..4a7f8ee27e54 100644 --- 
a/crates/polars-python/src/series/buffers.rs +++ b/crates/polars-python/src/series/buffers.rs @@ -109,7 +109,7 @@ fn get_buffers_from_primitive( .iter() .map(|arr| arr.with_validity(None)) .collect::>(); - let values = Series::try_from((s.name(), chunks)) + let values = Series::try_from((s.name().clone(), chunks)) .map_err(PyPolarsErr::from)? .into(); @@ -151,7 +151,7 @@ fn get_string_bytes(arr: &Utf8Array) -> PyResult { let values_arr = PrimitiveArray::::try_new(ArrowDataType::UInt8, values_buffer.clone(), None) .map_err(PyPolarsErr::from)?; - let values = Series::from_arrow("", values_arr.to_boxed()) + let values = Series::from_arrow(PlSmallStr::const_default(), values_arr.to_boxed()) .map_err(PyPolarsErr::from)? .into(); Ok(values) @@ -162,7 +162,7 @@ fn get_string_offsets(arr: &Utf8Array) -> PyResult { let offsets_arr = PrimitiveArray::::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None) .map_err(PyPolarsErr::from)?; - let offsets = Series::from_arrow("", offsets_arr.to_boxed()) + let offsets = Series::from_arrow(PlSmallStr::const_default(), offsets_arr.to_boxed()) .map_err(PyPolarsErr::from)? 
.into(); Ok(offsets) @@ -203,7 +203,9 @@ impl PySeries { }, }; - let s = Series::from_arrow("", arr_boxed).unwrap().into(); + let s = Series::from_arrow(PlSmallStr::const_default(), arr_boxed) + .unwrap() + .into(); Ok(s) } } @@ -355,13 +357,13 @@ fn from_buffers_num_impl( validity: Option, ) -> PyResult { let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity); - let s_result = Series::from_arrow("", arr.to_boxed()); + let s_result = Series::from_arrow(PlSmallStr::const_default(), arr.to_boxed()); let s = s_result.map_err(PyPolarsErr::from)?; Ok(s) } fn from_buffers_bool_impl(data: Bitmap, validity: Option) -> PyResult { let arr = BooleanArray::new(ArrowDataType::Boolean, data, validity); - let s_result = Series::from_arrow("", arr.to_boxed()); + let s_result = Series::from_arrow(PlSmallStr::const_default(), arr.to_boxed()); let s = s_result.map_err(PyPolarsErr::from)?; Ok(s) } @@ -376,7 +378,7 @@ fn from_buffers_string_impl( let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity); // This is not zero-copy - let s_result = Series::from_arrow("", arr.to_boxed()); + let s_result = Series::from_arrow(PlSmallStr::const_default(), arr.to_boxed()); let s = s_result.map_err(PyPolarsErr::from)?; Ok(s) diff --git a/crates/polars-python/src/series/c_interface.rs b/crates/polars-python/src/series/c_interface.rs index 29561ed7807c..f1478a0b5b09 100644 --- a/crates/polars-python/src/series/c_interface.rs +++ b/crates/polars-python/src/series/c_interface.rs @@ -29,7 +29,7 @@ impl PySeries { }) .collect::>(); - let s = Series::try_from((name, chunks)).map_err(PyPolarsErr::from)?; + let s = Series::try_new(name.into(), chunks).map_err(PyPolarsErr::from)?; Ok(s.into()) } @@ -54,7 +54,11 @@ unsafe fn export_chunk( let out_ptr = out_ptr as *mut arrow::ffi::ArrowArray; *out_ptr = c_array; - let field = ArrowField::new(s.name(), s.dtype().to_arrow(CompatLevel::newest()), true); + let field = ArrowField::new( + s.name().clone(), + 
s.dtype().to_arrow(CompatLevel::newest()), + true, + ); let c_schema = arrow::ffi::export_field_to_c(&field); let out_schema_ptr = out_schema_ptr as *mut arrow::ffi::ArrowSchema; diff --git a/crates/polars-python/src/series/comparison.rs b/crates/polars-python/src/series/comparison.rs index 8ebd85021463..7064edb7698a 100644 --- a/crates/polars-python/src/series/comparison.rs +++ b/crates/polars-python/src/series/comparison.rs @@ -227,7 +227,10 @@ macro_rules! impl_decimal { #[pymethods] impl PySeries { fn $name(&self, rhs: PyDecimal) -> PyResult { - let rhs = Series::new("decimal", &[AnyValue::Decimal(rhs.0, rhs.1)]); + let rhs = Series::new( + PlSmallStr::from_static("decimal"), + &[AnyValue::Decimal(rhs.0, rhs.1)], + ); let s = self.series.$method(&rhs).map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } diff --git a/crates/polars-python/src/series/construction.rs b/crates/polars-python/src/series/construction.rs index c8361e7bb837..985495dd7077 100644 --- a/crates/polars-python/src/series/construction.rs +++ b/crates/polars-python/src/series/construction.rs @@ -52,7 +52,9 @@ fn mmap_numpy_array( let vals = unsafe { array.as_slice().unwrap() }; let arr = unsafe { arrow::ffi::mmap::slice_and_owner(vals, array.to_object(py)) }; - Series::from_arrow(name, arr.to_boxed()).unwrap().into() + Series::from_arrow(name.into(), arr.to_boxed()) + .unwrap() + .into() } #[pymethods] @@ -61,7 +63,7 @@ impl PySeries { fn new_bool(py: Python, name: &str, array: &Bound>, _strict: bool) -> Self { let array = array.readonly(); let vals = array.as_slice().unwrap(); - py.allow_threads(|| Series::new(name, vals).into()) + py.allow_threads(|| Series::new(name.into(), vals).into()) } #[staticmethod] @@ -73,7 +75,7 @@ impl PySeries { .iter() .map(|&val| if f32::is_nan(val) { None } else { Some(val) }) .collect_trusted(); - ca.with_name(name).into_series().into() + ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) } @@ -88,7 +90,7 @@ impl 
PySeries { .iter() .map(|&val| if f64::is_nan(val) { None } else { Some(val) }) .collect_trusted(); - ca.with_name(name).into_series().into() + ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) } @@ -100,7 +102,7 @@ impl PySeries { #[staticmethod] fn new_opt_bool(name: &str, values: &Bound, _strict: bool) -> PyResult { let len = values.len()?; - let mut builder = BooleanChunkedBuilder::new(name, len); + let mut builder = BooleanChunkedBuilder::new(name.into(), len); for res in values.iter()? { let value = res?; @@ -125,7 +127,7 @@ where T::Native: FromPyObject<'a>, { let len = values.len()?; - let mut builder = PrimitiveChunkedBuilder::::new(name, len); + let mut builder = PrimitiveChunkedBuilder::::new(name.into(), len); for res in values.iter()? { let value = res?; @@ -175,7 +177,7 @@ impl PySeries { .map(|v| py_object_to_any_value(&(v?).as_borrowed(), strict)) .collect::>>(); let result = any_values_result.and_then(|avs| { - let s = Series::from_any_values(name, avs.as_slice(), strict).map_err(|e| { + let s = Series::from_any_values(name.into(), avs.as_slice(), strict).map_err(|e| { PyTypeError::new_err(format!( "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types." )) @@ -213,19 +215,20 @@ impl PySeries { .iter()? .map(|v| py_object_to_any_value(&(v?).as_borrowed(), strict)) .collect::>>()?; - let s = Series::from_any_values_and_dtype(name, any_values.as_slice(), &dtype.0, strict) - .map_err(|e| { - PyTypeError::new_err(format!( - "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types." - )) - })?; + let s = + Series::from_any_values_and_dtype(name.into(), any_values.as_slice(), &dtype.0, strict) + .map_err(|e| { + PyTypeError::new_err(format!( + "{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types." 
+ )) + })?; Ok(s.into()) } #[staticmethod] fn new_str(name: &str, values: &Bound, _strict: bool) -> PyResult { let len = values.len()?; - let mut builder = StringChunkedBuilder::new(name, len); + let mut builder = StringChunkedBuilder::new(name.into(), len); for res in values.iter()? { let value = res?; @@ -245,7 +248,7 @@ impl PySeries { #[staticmethod] fn new_binary(name: &str, values: &Bound, _strict: bool) -> PyResult { let len = values.len()?; - let mut builder = BinaryChunkedBuilder::new(name, len); + let mut builder = BinaryChunkedBuilder::new(name.into(), len); for res in values.iter()? { let value = res?; @@ -277,7 +280,7 @@ impl PySeries { )); } } - Ok(Series::new(name, series).into()) + Ok(Series::new(name.into(), series).into()) } #[staticmethod] @@ -303,7 +306,7 @@ impl PySeries { }); // Object builder must be registered. This is done on import. let ca = ObjectChunked::::new_from_vec_and_validity( - name, + name.into(), values, validity.into(), ); @@ -317,7 +320,7 @@ impl PySeries { #[staticmethod] fn new_null(name: &str, values: &Bound, _strict: bool) -> PyResult { let len = values.len()?; - Ok(Series::new_null(name, len).into()) + Ok(Series::new_null(name.into(), len).into()) } #[staticmethod] @@ -329,7 +332,7 @@ impl PySeries { let array = arr.as_any().downcast_ref::().unwrap(); let fast_explode = array.offsets().as_slice().windows(2).all(|w| w[0] != w[1]); - let mut out = ListChunked::with_chunk(name, array.clone()); + let mut out = ListChunked::with_chunk(name.into(), array.clone()); if fast_explode { out.set_fast_explode() } @@ -337,7 +340,7 @@ impl PySeries { }, _ => { let series: Series = - std::convert::TryFrom::try_from((name, arr)).map_err(PyPolarsErr::from)?; + Series::try_new(name.into(), arr).map_err(PyPolarsErr::from)?; Ok(series.into()) }, } diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index 63c1caeb71ee..9d2c0a1de98e 100644 --- a/crates/polars-python/src/series/general.rs +++ 
b/crates/polars-python/src/series/general.rs @@ -191,11 +191,11 @@ impl PySeries { } pub fn name(&self) -> &str { - self.series.name() + self.series.name().as_str() } fn rename(&mut self, name: &str) { - self.series.rename(name); + self.series.rename(name.into()); } fn dtype(&self, py: Python) -> PyObject { @@ -333,7 +333,9 @@ impl PySeries { if skip_nulls && (series.null_count() == series.len()) { if let Some(output_type) = output_type { - return Ok(Series::full_null(series.name(), series.len(), &output_type.0).into()); + return Ok( + Series::full_null(series.name().clone(), series.len(), &output_type.0).into(), + ); } let msg = "The output type of the 'map_elements' function cannot be determined.\n\ The function was never called because 'skip_nulls=True' and all values are null.\n\ @@ -389,7 +391,7 @@ impl PySeries { }, }); avs.extend(iter); - return Ok(Series::new(self.name(), &avs).into()); + return Ok(Series::new(self.series.name().clone(), &avs).into()); } let out = match output_type { @@ -720,7 +722,7 @@ impl PySeries { ) -> PyResult { let out = self .series - .value_counts(sort, parallel, name, normalize) + .value_counts(sort, parallel, name.into(), normalize) .map_err(PyPolarsErr::from)?; Ok(out.into()) } diff --git a/crates/polars-python/src/series/import.rs b/crates/polars-python/src/series/import.rs index 5908ccdf6d11..b38e8c9573da 100644 --- a/crates/polars-python/src/series/import.rs +++ b/crates/polars-python/src/series/import.rs @@ -114,7 +114,7 @@ pub(crate) fn import_stream_pycapsule(capsule: &Bound) -> PyResult::from_vec_validity(self.name(), av, validity); + let ca = ChunkedArray::<$type>::from_vec_validity( + self.series.name().clone(), + av, + validity, + ); PySeries::new(ca.into_series()) }, Err(e) => { diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index 29febbfc149c..c959694214ce 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -16,6 +16,7 @@ polars-lazy = { workspace = true, features 
= ["abs", "binary_encoding", "concat_ polars-ops = { workspace = true } polars-plan = { workspace = true } polars-time = { workspace = true } +polars-utils = { workspace = true } hex = { workspace = true } once_cell = { workspace = true } diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index b131ae805339..6c8571ecebfd 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -8,6 +8,7 @@ use polars_lazy::prelude::*; use polars_ops::frame::JoinCoalesce; use polars_plan::dsl::function_expr::StructFunction; use polars_plan::prelude::*; +use polars_utils::format_pl_smallstr; use sqlparser::ast::{ BinaryOperator, CreateTable, Distinct, ExcludeSelectItem, Expr as SQLExpr, FunctionArg, GroupByExpr, Ident, JoinConstraint, JoinOperator, ObjectName, ObjectType, Offset, OrderBy, @@ -32,10 +33,10 @@ pub struct TableInfo { } struct SelectModifiers { - exclude: PlHashSet, // SELECT * EXCLUDE - ilike: Option, // SELECT * ILIKE - rename: PlHashMap, // SELECT * RENAME - replace: Vec, // SELECT * REPLACE + exclude: PlHashSet, // SELECT * EXCLUDE + ilike: Option, // SELECT * ILIKE + rename: PlHashMap, // SELECT * RENAME + replace: Vec, // SELECT * REPLACE } impl SelectModifiers { fn matches_ilike(&self, s: &str) -> bool { @@ -47,7 +48,7 @@ impl SelectModifiers { fn renamed_cols(&self) -> Vec { self.rename .iter() - .map(|(before, after)| col(before).alias(after)) + .map(|(before, after)| col(before.clone()).alias(after.clone())) .collect() } } @@ -380,12 +381,12 @@ impl SQLContext { .join_nulls(true); let lf_schema = self.get_frame_schema(&mut lf)?; - let lf_cols: Vec<_> = lf_schema.iter_names().map(|nm| col(nm)).collect(); + let lf_cols: Vec<_> = lf_schema.iter_names().map(|nm| col(nm.clone())).collect(); let joined_tbl = match quantifier { SetQuantifier::ByName => join.on(lf_cols).finish(), SetQuantifier::Distinct | SetQuantifier::None => { let rf_schema = self.get_frame_schema(&mut rf)?; - let rf_cols: Vec<_> = 
rf_schema.iter_names().map(|nm| col(nm)).collect(); + let rf_cols: Vec<_> = rf_schema.iter_names().map(|nm| col(nm.clone())).collect(); if lf_cols.len() != rf_cols.len() { polars_bail!(SQLInterface: "{} requires equal number of columns in each table (use '{} BY NAME' to combine mismatched tables)", op_name, op_name) } @@ -470,7 +471,7 @@ impl SQLContext { let plan = plan .split('\n') .collect::() - .with_name("Logical Plan"); + .with_name(PlSmallStr::from_static("Logical Plan")); let df = DataFrame::new(vec![plan])?; Ok(df.lazy()) }, @@ -480,7 +481,7 @@ impl SQLContext { // SHOW TABLES fn execute_show_tables(&mut self, _: &Statement) -> PolarsResult { - let tables = Series::new("name", self.get_tables()); + let tables = Series::new("name".into(), self.get_tables()); let df = DataFrame::new(vec![tables])?; Ok(df.lazy()) } @@ -592,7 +593,9 @@ impl SQLContext { }, )? }, - JoinOperator::CrossJoin => lf.cross_join(rf, Some(format!(":{}", r_name))), + JoinOperator::CrossJoin => { + lf.cross_join(rf, Some(format_pl_smallstr!(":{}", r_name))) + }, join_type => { polars_bail!(SQLInterface: "join type '{:?}' not currently supported", join_type) }, @@ -687,7 +690,7 @@ impl SQLContext { if matches!(&**e, Expr::Agg(_) | Expr::Len | Expr::Literal(_)) => {}, Expr::Alias(e, _) if matches!(&**e, Expr::Column(_)) => { if let Expr::Column(name) = &**e { - group_by_keys.push(col(name)); + group_by_keys.push(col(name.clone())); } }, _ => { @@ -773,7 +776,7 @@ impl SQLContext { .map(|e| { let expr = parse_sql_expr(e, self, schema.as_deref())?; if let Expr::Column(name) = expr { - Ok(name.to_string()) + Ok(name.clone()) } else { Err(polars_err!(SQLSyntax:"DISTINCT ON only supports column names")) } @@ -782,7 +785,7 @@ impl SQLContext { // DISTINCT ON has to apply the ORDER BY before the operation. 
lf = self.process_order_by(lf, &query.order_by, None)?; - return Ok(lf.unique_stable(Some(cols), UniqueKeepStrategy::First)); + return Ok(lf.unique_stable(Some(cols.clone()), UniqueKeepStrategy::First)); }, None => lf, }; @@ -804,7 +807,7 @@ impl SQLContext { }, SelectItem::ExprWithAlias { expr, alias } => { let expr = parse_sql_expr(expr, self, Some(schema))?; - Ok(vec![expr.alias(&alias.value)]) + Ok(vec![expr.alias(PlSmallStr::from_str(alias.value.as_str()))]) }, SelectItem::QualifiedWildcard(obj_name, wildcard_options) => self .process_qualified_wildcard( @@ -816,7 +819,7 @@ impl SQLContext { SelectItem::Wildcard(wildcard_options) => { let cols = schema .iter_names() - .map(|name| col(name)) + .map(|name| col(name.clone())) .collect::>(); self.process_wildcard_additional_options( @@ -980,14 +983,14 @@ impl SQLContext { } => { if let Some(alias) = alias { let table_name = alias.name.value.clone(); - let column_names: Vec> = alias + let column_names: Vec> = alias .columns .iter() .map(|c| { if c.value.is_empty() { None } else { - Some(c.value.as_str()) + Some(PlSmallStr::from_str(c.value.as_str())) } }) .collect(); @@ -1009,8 +1012,8 @@ impl SQLContext { ); } let column_series: Vec = column_values - .iter() - .zip(column_names.iter()) + .into_iter() + .zip(column_names) .map(|(s, name)| { if let Some(name) = name { s.clone().with_name(name) @@ -1076,7 +1079,7 @@ impl SQLContext { return Ok(lf); } let schema = self.get_frame_schema(&mut lf)?; - let columns_iter = schema.iter_names().map(|e| col(e)); + let columns_iter = schema.iter_names().map(|e| col(e.clone())); let order_by = order_by.as_ref().unwrap().exprs.clone(); let mut descending = Vec::with_capacity(order_by.len()); @@ -1154,7 +1157,8 @@ impl SQLContext { .. 
} = expr.deref() { - projection_overrides.insert(alias.as_ref(), col(name).alias(alias)); + projection_overrides + .insert(alias.as_ref(), col(name.clone()).alias(alias.clone())); } else if !is_agg_or_window && !group_by_keys_schema.contains(alias) { projection_aliases.insert(alias.as_ref()); } @@ -1199,7 +1203,7 @@ impl SQLContext { { projection_expr.clone() } else { - col(name) + col(name.clone()) } }) .collect::>(); @@ -1317,7 +1321,8 @@ impl SQLContext { RenameSelectItem::Multiple(renames) => renames.iter().collect(), }; for rn in renames { - let (before, after) = (rn.ident.value.clone(), rn.alias.value.clone()); + let (before, after) = (rn.ident.value.as_str(), rn.alias.value.as_str()); + let (before, after) = (PlSmallStr::from_str(before), PlSmallStr::from_str(after)); if before != after { modifiers.rename.insert(before, after); } @@ -1399,22 +1404,25 @@ fn expand_exprs(expr: Expr, schema: &SchemaRef) -> Vec { match expr { Expr::Wildcard => schema .iter_names() - .map(|name| col(name)) + .map(|name| col(name.clone())) .collect::>(), - Expr::Column(nm) if is_regex_colname(nm.clone()) => { + Expr::Column(nm) if is_regex_colname(nm.as_str()) => { let rx = regex::Regex::new(&nm).unwrap(); schema .iter_names() .filter(|name| rx.is_match(name)) - .map(|name| col(name)) + .map(|name| col(name.clone())) .collect::>() }, - Expr::Columns(names) => names.iter().map(|name| col(name)).collect::>(), + Expr::Columns(names) => names + .iter() + .map(|name| col(name.clone())) + .collect::>(), _ => vec![expr], } } -fn is_regex_colname(nm: ColumnName) -> bool { +fn is_regex_colname(nm: &str) -> bool { nm.starts_with('^') && nm.ends_with('$') } @@ -1491,7 +1499,7 @@ fn process_join_constraint( let right_names = tbl_right.schema.iter_names().collect::>(); let on = left_names .intersection(&right_names) - .map(|name| col(name)) + .map(|&name| col(name.clone())) .collect::>(); if on.is_empty() { polars_bail!(SQLInterface: "no common columns found for NATURAL JOIN") diff --git 
a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs index 0124a5409f7d..3b315f2e8f3b 100644 --- a/crates/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -10,6 +10,7 @@ use polars_plan::dsl::{coalesce, concat_str, len, max_horizontal, min_horizontal use polars_plan::plans::{typed_lit, LiteralValue}; use polars_plan::prelude::LiteralValue::Null; use polars_plan::prelude::{col, cols, lit, StrptimeOptions}; +use polars_utils::pl_str::PlSmallStr; use sqlparser::ast::{ DateTimeField, DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg, FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments, Ident, @@ -983,7 +984,7 @@ impl SQLFunctionVisitor<'_> { parse_extract_date_part( e, &DateTimeField::Custom(Ident { - value: p, + value: p.to_string(), quote_style: None, }), ) @@ -1154,11 +1155,11 @@ impl SQLFunctionVisitor<'_> { Strptime => { let args = extract_args(function)?; match args.len() { - 2 => self.visit_binary(|e, fmt| { + 2 => self.visit_binary(|e, fmt: String| { e.str().strptime( DataType::Datetime(TimeUnit::Microseconds, None), StrptimeOptions { - format: Some(fmt), + format: Some(fmt.into()), ..Default::default() }, lit("latest"), @@ -1274,37 +1275,39 @@ impl SQLFunctionVisitor<'_> { // ---- Columns => { let active_schema = self.active_schema; - self.try_visit_unary(|e: Expr| { - match e { - Expr::Literal(LiteralValue::String(pat)) => { - if "*" == pat { - polars_bail!(SQLSyntax: "COLUMNS('*') is not a valid regex; did you mean COLUMNS(*)?") - }; - let pat = match pat.as_str() { - _ if pat.starts_with('^') && pat.ends_with('$') => pat.to_string(), - _ if pat.starts_with('^') => format!("{}.*$", pat), - _ if pat.ends_with('$') => format!("^.*{}", pat), - _ => format!("^.*{}.*$", pat), - }; - if let Some(active_schema) = &active_schema { - let rx = regex::Regex::new(&pat).unwrap(); - let col_names = active_schema - .iter_names() - .filter(|name| rx.is_match(name)) - 
.collect::>(); - - Ok(if col_names.len() == 1 { - col(col_names[0]) - } else { - cols(col_names) - }) + self.try_visit_unary(|e: Expr| match e { + Expr::Literal(LiteralValue::String(pat)) => { + if pat == "*" { + polars_bail!( + SQLSyntax: "COLUMNS('*') is not a valid regex; \ + did you mean COLUMNS(*)?" + ) + }; + let pat = match pat.as_str() { + _ if pat.starts_with('^') && pat.ends_with('$') => pat.to_string(), + _ if pat.starts_with('^') => format!("{}.*$", pat), + _ if pat.ends_with('$') => format!("^.*{}", pat), + _ => format!("^.*{}.*$", pat), + }; + if let Some(active_schema) = &active_schema { + let rx = regex::Regex::new(&pat).unwrap(); + let col_names = active_schema + .iter_names() + .filter(|name| rx.is_match(name)) + .cloned() + .collect::>(); + + Ok(if col_names.len() == 1 { + col(col_names.into_iter().next().unwrap()) } else { - Ok(col(&pat)) - } - }, - Expr::Wildcard => Ok(col("*")), - _ => polars_bail!(SQLSyntax: "COLUMNS expects a regex; found {:?}", e), - } + cols(col_names) + }) + } else { + Ok(col(&pat)) + } + }, + Expr::Wildcard => Ok(col("*")), + _ => polars_bail!(SQLSyntax: "COLUMNS expects a regex; found {:?}", e), }) }, @@ -1760,7 +1763,7 @@ impl FromSQLExpr for StrptimeOptions { match expr { SQLExpr::Value(v) => match v { SQLValue::SingleQuotedString(s) => Ok(StrptimeOptions { - format: Some(s.clone()), + format: Some(PlSmallStr::from_str(s)), ..StrptimeOptions::default() }), _ => polars_bail!(SQLInterface: "cannot parse literal {:?}", v), diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index 9374a5fd3229..92934732aa91 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -248,7 +248,7 @@ impl SQLExprVisitor<'_> { }) .collect::>>()?; - Series::from_any_values("", &array_elements, true) + Series::from_any_values(PlSmallStr::const_default(), &array_elements, true) } fn visit_expr(&mut self, expr: &SQLExpr) -> PolarsResult { @@ -1369,15 +1369,15 @@ pub(crate) fn 
resolve_compound_identifier( if lf.is_some() && name == "*" { return Ok(schema .iter_names() - .map(|name| col(name)) + .map(|name| col(name.clone())) .collect::>()); } else if let Some((_, name, dtype)) = schema.get_full(name) { let resolved = &ctx.resolve_name(&ident_root.value, name); Ok(( - if name != resolved { - col(resolved).alias(name) + if name != resolved.as_str() { + col(resolved).alias(name.clone()) } else { - col(name) + col(name.clone()) }, Some(dtype), )) diff --git a/crates/polars-sql/tests/issues.rs b/crates/polars-sql/tests/issues.rs index 31c0a89e84ff..10ee22db49d3 100644 --- a/crates/polars-sql/tests/issues.rs +++ b/crates/polars-sql/tests/issues.rs @@ -113,7 +113,7 @@ fn iss_8395() -> PolarsResult<()> { // assert that the df only contains [vegetables, seafood] let s = df.column("category")?.unique()?.sort(Default::default())?; - let expected = Series::new("category", &["seafood", "vegetables"]); + let expected = Series::new("category".into(), &["seafood", "vegetables"]); assert!(s.equals(&expected)); Ok(()) } diff --git a/crates/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs index c37e12ed3040..b84c6e681cd2 100644 --- a/crates/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -4,8 +4,11 @@ use polars_sql::*; use polars_time::Duration; fn create_sample_df() -> DataFrame { - let a = Series::new("a", (1..10000i64).map(|i| i / 100).collect::>()); - let b = Series::new("b", 1..10000i64); + let a = Series::new( + "a".into(), + (1..10000i64).map(|i| i / 100).collect::>(), + ); + let b = Series::new("b".into(), 1..10000i64); DataFrame::new(vec![a, b]).unwrap() } diff --git a/crates/polars-sql/tests/statements.rs b/crates/polars-sql/tests/statements.rs index e5be8e598b60..2657ec443077 100644 --- a/crates/polars-sql/tests/statements.rs +++ b/crates/polars-sql/tests/statements.rs @@ -3,8 +3,8 @@ use polars_lazy::prelude::*; use polars_sql::*; fn create_ctx() -> SQLContext { - let a = 
Series::new("a", (1..10i64).map(|i| i / 100).collect::>()); - let b = Series::new("b", 1..10i64); + let a = Series::new("a".into(), (1..10i64).map(|i| i / 100).collect::>()); + let b = Series::new("b".into(), 1..10i64); let df = DataFrame::new(vec![a, b]).unwrap().lazy(); let mut ctx = SQLContext::new(); ctx.register("df", df); diff --git a/crates/polars-sql/tests/udf.rs b/crates/polars-sql/tests/udf.rs index 66eb0353b07d..3ccd1c4d6395 100644 --- a/crates/polars-sql/tests/udf.rs +++ b/crates/polars-sql/tests/udf.rs @@ -33,10 +33,10 @@ impl FunctionRegistry for MyFunctionRegistry { #[test] fn test_udfs() -> PolarsResult<()> { let my_custom_sum = UserDefinedFunction::new( - "my_custom_sum", + "my_custom_sum".into(), vec![ - Field::new("a", DataType::Int32), - Field::new("b", DataType::Int32), + Field::new("a".into(), DataType::Int32), + Field::new("b".into(), DataType::Int32), ], GetOutput::same_type(), move |s: &mut [Series]| { @@ -68,10 +68,10 @@ fn test_udfs() -> PolarsResult<()> { // create a new UDF to be registered on the context let my_custom_divide = UserDefinedFunction::new( - "my_custom_divide", + "my_custom_divide".into(), vec![ - Field::new("a", DataType::Int32), - Field::new("b", DataType::Int32), + Field::new("a".into(), DataType::Int32), + Field::new("b".into(), DataType::Int32), ], GetOutput::same_type(), move |s: &mut [Series]| { diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs index 16184645da74..f1ff1973920d 100644 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ b/crates/polars-stream/src/nodes/parquet_source.rs @@ -29,6 +29,7 @@ use polars_plan::plans::FileInfo; use polars_plan::prelude::FileScanOptions; use polars_utils::aliases::PlHashSet; use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; use polars_utils::slice::GetSaferUnchecked; use polars_utils::IdxSize; @@ -1044,7 +1045,7 @@ struct RowGroupDataFetcher { use_statistics: bool, verbose: bool, 
reader_schema: Arc, - projection: Option>, + projection: Option>, predicate: Option>, slice_range: Option>, memory_prefetch_func: fn(&[u8]) -> (), @@ -1402,7 +1403,7 @@ struct RowGroupDecoder { paths: Arc>, hive_partitions: Option>>, hive_partitions_width: usize, - include_file_paths: Option>, + include_file_paths: Option, projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, row_index: Option, physical_predicate: Option>, @@ -1542,7 +1543,7 @@ impl RowGroupDecoder { // so we create the row index column manually instead of using `df.with_row_index` to // ensure it has the correct number of rows. let mut ca = IdxCa::from_vec( - name, + name.clone(), (offset..offset + projection_height as IdxSize).collect(), ); ca.set_sorted_flag(IsSorted::Ascending); @@ -1565,7 +1566,7 @@ impl RowGroupDecoder { vec![] }; - let file_path_series = self.include_file_paths.as_deref().map(|file_path_col| { + let file_path_series = self.include_file_paths.clone().map(|file_path_col| { StringChunked::full( file_path_col, self.paths[path_index].to_str().unwrap(), @@ -1793,13 +1794,13 @@ fn get_row_group_byte_ranges( /// merged. 
fn get_row_group_byte_ranges_for_projection<'a>( row_group_metadata: &'a RowGroupMetaData, - columns: &'a [String], + columns: &'a [PlSmallStr], ) -> impl Iterator> + 'a { let row_group_columns = row_group_metadata.columns(); row_group_columns.iter().filter_map(move |rg_col_metadata| { for col_name in columns { - if &rg_col_metadata.descriptor().path_in_schema[0] == col_name { + if rg_col_metadata.descriptor().path_in_schema[0] == col_name { let (offset, len) = rg_col_metadata.byte_range(); let range = (offset as usize)..((offset + len) as usize); return Some(range); @@ -1825,7 +1826,7 @@ fn ensure_metadata_has_projected_fields( let dtype = DataType::from_arrow(&x.data_type, true); (x.name, dtype) }) - .collect::>(); + .collect::>(); for field in projected_fields { let Some(dtype) = schema.remove(&field.name) else { diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index 3b6c7b2bea62..d2a665a505df 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -114,7 +114,10 @@ impl ComputeNode for ReduceNode { .zip(self.output_schema.iter_fields()) .map(|(r, field)| { r.finalize().map(|scalar| { - scalar.into_series(&field.name).cast(&field.dtype).unwrap() + scalar + .into_series(field.name.clone()) + .cast(&field.dtype) + .unwrap() }) }) .try_collect_vec()?; diff --git a/crates/polars-stream/src/nodes/simple_projection.rs b/crates/polars-stream/src/nodes/simple_projection.rs index d4e82dde8ad8..95f002df2889 100644 --- a/crates/polars-stream/src/nodes/simple_projection.rs +++ b/crates/polars-stream/src/nodes/simple_projection.rs @@ -1,16 +1,17 @@ use std::sync::Arc; use polars_core::schema::Schema; +use polars_utils::pl_str::PlSmallStr; use super::compute_node_prelude::*; pub struct SimpleProjectionNode { - columns: Vec, + columns: Vec, input_schema: Arc, } impl SimpleProjectionNode { - pub fn new(columns: Vec, input_schema: Arc) -> Self { + pub fn new(columns: Vec, 
input_schema: Arc) -> Self { Self { columns, input_schema, @@ -47,7 +48,12 @@ impl ComputeNode for SimpleProjectionNode { while let Ok(morsel) = recv.recv().await { let morsel = morsel.try_map(|df| { // TODO: can this be unchecked? - df.select_with_schema(&slf.columns, &slf.input_schema) + let check_duplicates = true; + df._select_with_schema_impl( + slf.columns.as_slice(), + &slf.input_schema, + check_duplicates, + ) })?; if send.send(morsel).await.is_err() { diff --git a/crates/polars-stream/src/nodes/zip.rs b/crates/polars-stream/src/nodes/zip.rs index ff1e336a178f..3a9290bde59d 100644 --- a/crates/polars-stream/src/nodes/zip.rs +++ b/crates/polars-stream/src/nodes/zip.rs @@ -95,7 +95,7 @@ impl InputHead { } else { self.schema .iter() - .map(|(name, dtype)| Series::full_null(name, len, dtype)) + .map(|(name, dtype)| Series::full_null(name.clone(), len, dtype)) .collect() } } diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index 20aa1cf1486f..8a3e7a1b8ac4 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -40,7 +40,7 @@ fn visualize_plan_rec( PhysNodeKind::InMemorySource { df } => ( format!( "in-memory-source\\ncols: {}", - df.get_column_names().join(", ") + df.get_column_names_owned().join(", ") ), &[][..], ), diff --git a/crates/polars-stream/src/physical_plan/lower_expr.rs b/crates/polars-stream/src/physical_plan/lower_expr.rs index fe98cc8efe4c..b91dbe1419ff 100644 --- a/crates/polars-stream/src/physical_plan/lower_expr.rs +++ b/crates/polars-stream/src/physical_plan/lower_expr.rs @@ -12,17 +12,19 @@ use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, LiteralValue}; use polars_plan::prelude::*; use polars_utils::arena::{Arena, Node}; +use polars_utils::format_pl_smallstr; use polars_utils::itertools::Itertools; +use polars_utils::pl_str::PlSmallStr; use slotmap::SlotMap; use super::{PhysNode, PhysNodeKey, 
PhysNodeKind}; type IRNodeKey = Node; -fn unique_column_name() -> ColumnName { +fn unique_column_name() -> PlSmallStr { static COUNTER: AtomicU64 = AtomicU64::new(0); let idx = COUNTER.fetch_add(1, Ordering::Relaxed); - format!("__POLARS_STMP_{idx}").into() + format_pl_smallstr!("__POLARS_STMP_{idx}") } pub(crate) struct ExprCache { @@ -379,7 +381,7 @@ fn lower_exprs_with_ctx( let node = build_input_independent_node_with_ctx(&expr_irs, ctx)?; let out_exprs = expr_irs .iter() - .map(|e| ctx.expr_arena.add(AExpr::Column(e.output_name().into()))) + .map(|e| ctx.expr_arena.add(AExpr::Column(e.output_name().clone()))) .collect(); return Ok((node, out_exprs)); } @@ -410,7 +412,8 @@ fn lower_exprs_with_ctx( let (trans_input, trans_exprs) = lower_exprs_with_ctx(input, &[inner], ctx)?; let exploded_name = unique_column_name(); let trans_inner = ctx.expr_arena.add(AExpr::Explode(trans_exprs[0])); - let explode_expr = ExprIR::new(trans_inner, OutputName::Alias(exploded_name.clone())); + let explode_expr = + ExprIR::new(trans_inner, OutputName::Alias(exploded_name.clone())); let output_schema = schema_for_select(trans_input, &[explode_expr.clone()], ctx)?; let node_kind = PhysNodeKind::Select { input: trans_input, @@ -596,19 +599,26 @@ fn lower_exprs_with_ctx( transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); }, }, - AExpr::AnonymousFunction { - .. - } - | AExpr::Function { - .. - } - | AExpr::Len // TODO: this one makes me really sad, make this streaming ASAP. + AExpr::Len => { + let out_name = unique_column_name(); + let expr_ir = ExprIR::new(expr, OutputName::Alias(out_name.clone())); + let output_schema = schema_for_select(input, &[expr_ir.clone()], ctx)?; + let kind = PhysNodeKind::Reduce { + input, + exprs: vec![expr_ir], + }; + let reduce_node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + input_nodes.insert(reduce_node_key); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); + }, + AExpr::AnonymousFunction { .. 
} + | AExpr::Function { .. } | AExpr::Slice { .. } | AExpr::Window { .. } => { let out_name = unique_column_name(); fallback_subset.push(ExprIR::new(expr, OutputName::Alias(out_name.clone()))); transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); - } + }, } } @@ -652,7 +662,7 @@ fn schema_for_select( let output_schema: Schema = exprs .iter() .map(|e| { - let name = e.output_name(); + let name = e.output_name().clone(); let dtype = ctx.expr_arena.get(e.node()).to_dtype( input_schema, Context::Default, @@ -674,10 +684,10 @@ fn build_select_node_with_ctx( } // Are we only selecting simple columns, with the same name? - let all_simple_columns: Option> = exprs + let all_simple_columns: Option> = exprs .iter() .map(|e| match ctx.expr_arena.get(e.node()) { - AExpr::Column(name) if name.as_ref() == e.output_name() => Some(name.to_string()), + AExpr::Column(name) if name == e.output_name() => Some(name.clone()), _ => None, }) .collect(); @@ -701,7 +711,7 @@ fn build_select_node_with_ctx( let trans_expr_irs = exprs .iter() .zip(transformed_exprs) - .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().into()))) + .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().clone()))) .collect_vec(); let output_schema = schema_for_select(transformed_input, &trans_expr_irs, ctx)?; let node_kind = PhysNodeKind::Select { @@ -735,7 +745,7 @@ pub fn lower_exprs( let trans_expr_irs = exprs .iter() .zip(transformed_exprs) - .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().into()))) + .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().clone()))) .collect_vec(); Ok((transformed_input, trans_expr_irs)) } diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 161b9b9eb24e..b9693e6c3c56 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -1,10 +1,10 @@ use std::sync::Arc; use 
polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; -use polars_core::schema::Schema; +use polars_core::schema::{IndexOfSchema, Schema}; use polars_error::PolarsResult; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; -use polars_plan::plans::{AExpr, ColumnName, IR}; +use polars_plan::plans::{AExpr, IR}; use polars_plan::prelude::SinkType; use polars_utils::arena::{Arena, Node}; use polars_utils::itertools::Itertools; @@ -26,7 +26,7 @@ pub fn lower_ir( let output_schema = IR::schema_with_cache(node, ir_arena, schema_cache); let node_kind = match ir_node { IR::SimpleProjection { input, columns } => { - let columns = columns.iter_names().map(|s| s.to_string()).collect(); + let columns = columns.get_names_owned(); let phys_input = lower_ir( *input, ir_arena, @@ -95,7 +95,7 @@ pub fn lower_ir( let input_schema = &phys_sm[phys_input].output_schema; let mut selectors = PlIndexMap::with_capacity(input_schema.len() + exprs.len()); for name in input_schema.iter_names() { - let col_name: Arc = name.as_str().into(); + let col_name = name.clone(); let col_expr = expr_arena.add(AExpr::Column(col_name.clone())); selectors.insert( name.clone(), @@ -103,7 +103,7 @@ pub fn lower_ir( ); } for expr in exprs { - selectors.insert(expr.output_name().into(), expr); + selectors.insert(expr.output_name().clone(), expr); } let selectors = selectors.into_values().collect_vec(); return super::lower_expr::build_select_node( @@ -145,8 +145,8 @@ pub fn lower_ir( )?; let cols_and_predicate = output_schema .iter_names() + .cloned() .map(|name| { - let name: ColumnName = name.as_str().into(); ExprIR::new( expr_arena.add(AExpr::Column(name.clone())), OutputName::ColumnLhs(name), @@ -200,10 +200,7 @@ pub fn lower_ir( let phys_input = phys_sm.insert(PhysNode::new(schema, node_kind)); node_kind = PhysNodeKind::SimpleProjection { input: phys_input, - columns: projection_schema - .iter_names() - .map(|s| s.to_string()) - .collect(), + columns: projection_schema.get_names_owned(), }; schema 
= projection_schema.clone(); } diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 487bff6e13d4..99103343565a 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -17,6 +17,7 @@ mod to_graph; pub use fmt::visualize_plan; use polars_plan::prelude::FileScanOptions; use polars_utils::arena::{Arena, Node}; +use polars_utils::pl_str::PlSmallStr; use slotmap::{Key, SecondaryMap, SlotMap}; pub use to_graph::physical_plan_to_graph; @@ -76,7 +77,7 @@ pub enum PhysNodeKind { SimpleProjection { input: PhysNodeKey, - columns: Vec, + columns: Vec, }, InMemorySink { diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 44e32e6fc348..d0bd342b0f65 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -47,7 +47,7 @@ fn create_stream_expr( struct GraphConversionContext<'a> { phys_sm: &'a SlotMap, - expr_arena: &'a Arena, + expr_arena: &'a mut Arena, graph: Graph, phys_to_graph: SecondaryMap, expr_conversion_state: ExpressionConversionState, @@ -56,7 +56,7 @@ struct GraphConversionContext<'a> { pub fn physical_plan_to_graph( root: PhysNodeKey, phys_sm: &SlotMap, - expr_arena: &Arena, + expr_arena: &mut Arena, ) -> PolarsResult<(Graph, SecondaryMap)> { let expr_depth_limit = get_expr_depth_limit()?; let mut ctx = GraphConversionContext { @@ -138,8 +138,7 @@ fn to_graph_rec<'a>( let mut inputs = Vec::with_capacity(reductions.len()); for e in exprs { - let (red, input_node) = - into_reduction(e.node(), ctx.expr_arena, input_schema)?.expect("invariant"); + let (red, input_node) = into_reduction(e.node(), ctx.expr_arena, input_schema)?; reductions.push(red); let input_phys = diff --git a/crates/polars-stream/src/skeleton.rs b/crates/polars-stream/src/skeleton.rs index d1026dfdd760..20ca189de9e0 100644 --- 
a/crates/polars-stream/src/skeleton.rs +++ b/crates/polars-stream/src/skeleton.rs @@ -2,7 +2,7 @@ use polars_core::prelude::*; use polars_core::POOL; use polars_expr::planner::{create_physical_expr, get_expr_depth_limit, ExpressionConversionState}; -use polars_plan::plans::{Context, IR}; +use polars_plan::plans::{Context, IRPlan, IR}; use polars_plan::prelude::expr_ir::ExprIR; use polars_plan::prelude::AExpr; use polars_utils::arena::{Arena, Node}; @@ -17,6 +17,15 @@ pub fn run_query( mut ir_arena: Arena, expr_arena: &mut Arena, ) -> PolarsResult { + if let Ok(visual_path) = std::env::var("POLARS_VISUALIZE_IR") { + let plan = IRPlan { + lp_top: node, + lp_arena: ir_arena.clone(), + expr_arena: expr_arena.clone(), + }; + let visualization = plan.display_dot().to_string(); + std::fs::write(visual_path, visualization).unwrap(); + } let mut phys_sm = SlotMap::with_capacity_and_key(ir_arena.len()); let root = crate::physical_plan::build_physical_plan(node, &mut ir_arena, expr_arena, &mut phys_sm)?; diff --git a/crates/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml index a716974f0e76..cbddfbf4eba8 100644 --- a/crates/polars-time/Cargo.toml +++ b/crates/polars-time/Cargo.toml @@ -23,7 +23,6 @@ now = { version = "0.1" } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true, optional = true } -smartstring = { workspace = true } [dev-dependencies] polars-ops = { workspace = true, features = ["abs"] } @@ -39,7 +38,7 @@ offset_by = [] rolling_window = ["polars-core/rolling_window"] rolling_window_by = ["polars-core/rolling_window_by", "dtype-duration"] fmt = ["polars-core/fmt"] -serde = ["dep:serde", "smartstring/serde"] +serde = ["dep:serde"] temporal = ["polars-core/temporal"] timezones = ["chrono-tz", "dtype-datetime", "polars-core/timezones", "arrow/timezones", "polars-ops/timezones"] diff --git a/crates/polars-time/src/chunkedarray/date.rs b/crates/polars-time/src/chunkedarray/date.rs index 402f01c43017..8132f1ea2bba 100644 --- 
a/crates/polars-time/src/chunkedarray/date.rs +++ b/crates/polars-time/src/chunkedarray/date.rs @@ -73,11 +73,11 @@ pub trait DateMethods: AsDate { ca.apply_kernel_cast::(&date_to_ordinal) } - fn parse_from_str_slice(name: &str, v: &[&str], fmt: &str) -> DateChunked; + fn parse_from_str_slice(name: PlSmallStr, v: &[&str], fmt: &str) -> DateChunked; } impl DateMethods for DateChunked { - fn parse_from_str_slice(name: &str, v: &[&str], fmt: &str) -> DateChunked { + fn parse_from_str_slice(name: PlSmallStr, v: &[&str], fmt: &str) -> DateChunked { Int32Chunked::from_iter_options( name, v.iter().map(|s| { diff --git a/crates/polars-time/src/chunkedarray/datetime.rs b/crates/polars-time/src/chunkedarray/datetime.rs index de14c83c6e72..0e4adf3094b8 100644 --- a/crates/polars-time/src/chunkedarray/datetime.rs +++ b/crates/polars-time/src/chunkedarray/datetime.rs @@ -25,7 +25,7 @@ fn cast_and_apply< .unwrap(); func(&*arr).unwrap() }); - ChunkedArray::from_chunk_iter(ca.name(), chunks) + ChunkedArray::from_chunk_iter(ca.name().clone(), chunks) } pub trait DatetimeMethods: AsDatetime { @@ -130,7 +130,12 @@ pub trait DatetimeMethods: AsDatetime { ca.apply_kernel_cast::(&f) } - fn parse_from_str_slice(name: &str, v: &[&str], fmt: &str, tu: TimeUnit) -> DatetimeChunked { + fn parse_from_str_slice( + name: PlSmallStr, + v: &[&str], + fmt: &str, + tu: TimeUnit, + ) -> DatetimeChunked { let func = match tu { TimeUnit::Nanoseconds => datetime_to_timestamp_ns, TimeUnit::Microseconds => datetime_to_timestamp_us, @@ -175,7 +180,7 @@ mod test { // NOTE: the values are checked and correct. 
let dt = DatetimeChunked::from_naive_datetime( - "name", + "name".into(), datetimes.iter().copied(), TimeUnit::Nanoseconds, ); diff --git a/crates/polars-time/src/chunkedarray/rolling_window/dispatch.rs b/crates/polars-time/src/chunkedarray/rolling_window/dispatch.rs index 652629c336a4..427646c43914 100644 --- a/crates/polars-time/src/chunkedarray/rolling_window/dispatch.rs +++ b/crates/polars-time/src/chunkedarray/rolling_window/dispatch.rs @@ -32,7 +32,7 @@ where { polars_ensure!(options.min_periods <= options.window_size, InvalidOperation: "`min_periods` should be <= `window_size`"); if ca.is_empty() { - return Ok(Series::new_empty(ca.name(), ca.dtype())); + return Ok(Series::new_empty(ca.name().clone(), ca.dtype())); } let ca = ca.rechunk(); @@ -55,7 +55,7 @@ where options.fn_params, ), }; - Series::try_from((ca.name(), arr)) + Series::try_from((ca.name().clone(), arr)) } #[cfg(feature = "rolling_window_by")] @@ -80,7 +80,7 @@ where T: PolarsNumericType, { if ca.is_empty() { - return Ok(Series::new_empty(ca.name(), ca.dtype())); + return Ok(Series::new_empty(ca.name().clone(), ca.dtype())); } polars_ensure!(by.null_count() == 0 && ca.null_count() == 0, InvalidOperation: "'Expr.rolling_*_by(...)' not yet supported for series with null values, consider using 'DataFrame.rolling' or 'Expr.rolling'"); polars_ensure!(ca.len() == by.len(), InvalidOperation: "`by` column in `rolling_*_by` must be the same length as values column"); @@ -141,7 +141,7 @@ where Some(sorting_indices.cont_slice().unwrap()), )? 
}; - Series::try_from((ca.name(), out)) + Series::try_from((ca.name().clone(), out)) } pub trait SeriesOpsTime: AsSeries { diff --git a/crates/polars-time/src/chunkedarray/string/infer.rs b/crates/polars-time/src/chunkedarray/string/infer.rs index 5f0f3d7daf96..f91d0ab40869 100644 --- a/crates/polars-time/src/chunkedarray/string/infer.rs +++ b/crates/polars-time/src/chunkedarray/string/infer.rs @@ -325,11 +325,11 @@ where .map(|opt_val| opt_val.and_then(|val| self.parse(val))); PrimitiveArray::from_trusted_len_iter(iter) }); - ChunkedArray::from_chunk_iter(ca.name(), chunks) + ChunkedArray::from_chunk_iter(ca.name().clone(), chunks) .into_series() .cast(&self.logical_type) .unwrap() - .with_name(ca.name()) + .with_name(ca.name().clone()) } } @@ -444,7 +444,9 @@ pub(crate) fn to_datetime( _ambiguous: &StringChunked, ) -> PolarsResult { match ca.first_non_null() { - None => Ok(Int64Chunked::full_null(ca.name(), ca.len()).into_datetime(tu, tz.cloned())), + None => { + Ok(Int64Chunked::full_null(ca.name().clone(), ca.len()).into_datetime(tu, tz.cloned())) + }, Some(idx) => { let subset = ca.slice(idx as i64, ca.len()); let pattern = subset @@ -459,7 +461,8 @@ pub(crate) fn to_datetime( // `tz` has already been validated. 
ca.set_time_unit_and_time_zone( tu, - tz.cloned().unwrap_or_else(|| "UTC".to_string()), + tz.cloned() + .unwrap_or_else(|| PlSmallStr::from_static("UTC")), )?; Ok(ca) })?, @@ -484,7 +487,7 @@ pub(crate) fn to_datetime( #[cfg(feature = "dtype-date")] pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult { match ca.first_non_null() { - None => Ok(Int32Chunked::full_null(ca.name(), ca.len()).into_date()), + None => Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date()), Some(idx) => { let subset = ca.slice(idx as i64, ca.len()); let pattern = subset diff --git a/crates/polars-time/src/chunkedarray/string/mod.rs b/crates/polars-time/src/chunkedarray/string/mod.rs index 42221b4861f1..f48faee8dd89 100644 --- a/crates/polars-time/src/chunkedarray/string/mod.rs +++ b/crates/polars-time/src/chunkedarray/string/mod.rs @@ -108,7 +108,7 @@ pub trait StringMethods: AsString { (string_ca.len() as f64).sqrt() as usize, ); let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache)); - Ok(ca.with_name(string_ca.name()).into()) + Ok(ca.with_name(string_ca.name().clone()).into()) } #[cfg(feature = "dtype-date")] @@ -143,7 +143,7 @@ pub trait StringMethods: AsString { } None }); - Ok(ca.with_name(string_ca.name()).into()) + Ok(ca.with_name(string_ca.name().clone()).into()) } #[cfg(feature = "dtype-datetime")] @@ -200,7 +200,7 @@ pub trait StringMethods: AsString { } None }) - .with_name(string_ca.name()); + .with_name(string_ca.name().clone()); match (tz_aware, tz) { #[cfg(feature = "timezones")] (false, Some(tz)) => polars_ops::prelude::replace_time_zone( @@ -210,7 +210,10 @@ pub trait StringMethods: AsString { NonExistent::Raise, ), #[cfg(feature = "timezones")] - (true, tz) => Ok(ca.into_datetime(tu, tz.cloned().or_else(|| Some("UTC".to_string())))), + (true, tz) => Ok(ca.into_datetime( + tu, + tz.cloned().or_else(|| Some(PlSmallStr::from_static("UTC"))), + )), _ => Ok(ca.into_datetime(tu, None)), } } @@ -253,7 +256,7 @@ pub trait StringMethods: 
AsString { unary_elementwise(string_ca, |val| convert.eval(val?, use_cache)) }; - Ok(ca.with_name(string_ca.name()).into()) + Ok(ca.with_name(string_ca.name().clone()).into()) } #[cfg(feature = "dtype-datetime")] @@ -293,10 +296,13 @@ pub trait StringMethods: AsString { ); Ok( unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache)) - .with_name(string_ca.name()) + .with_name(string_ca.name().clone()) .into_datetime( tu, - Some(tz.map(|x| x.to_string()).unwrap_or("UTC".to_string())), + Some( + tz.cloned() + .unwrap_or_else(|| PlSmallStr::from_static("UTC")), + ), ), ) } @@ -332,7 +338,9 @@ pub trait StringMethods: AsString { ); unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache)) }; - let dt = ca.with_name(string_ca.name()).into_datetime(tu, None); + let dt = ca + .with_name(string_ca.name().clone()) + .into_datetime(tu, None); match tz { #[cfg(feature = "timezones")] Some(tz) => polars_ops::prelude::replace_time_zone( diff --git a/crates/polars-time/src/chunkedarray/time.rs b/crates/polars-time/src/chunkedarray/time.rs index 6f3c1ab10c51..c0d267202e12 100644 --- a/crates/polars-time/src/chunkedarray/time.rs +++ b/crates/polars-time/src/chunkedarray/time.rs @@ -20,7 +20,7 @@ pub trait TimeMethods { /// The range from 1,000,000,000 to 1,999,999,999 represents the leap second. 
fn nanosecond(&self) -> Int32Chunked; - fn parse_from_str_slice(name: &str, v: &[&str], fmt: &str) -> TimeChunked; + fn parse_from_str_slice(name: PlSmallStr, v: &[&str], fmt: &str) -> TimeChunked; } impl TimeMethods for TimeChunked { @@ -49,7 +49,7 @@ impl TimeMethods for TimeChunked { self.apply_kernel_cast::(&time_to_nanosecond) } - fn parse_from_str_slice(name: &str, v: &[&str], fmt: &str) -> TimeChunked { + fn parse_from_str_slice(name: PlSmallStr, v: &[&str], fmt: &str) -> TimeChunked { v.iter() .map(|s| { NaiveTime::parse_from_str(s, fmt) diff --git a/crates/polars-time/src/date_range.rs b/crates/polars-time/src/date_range.rs index b44afebddb32..8f01d687fd83 100644 --- a/crates/polars-time/src/date_range.rs +++ b/crates/polars-time/src/date_range.rs @@ -3,6 +3,7 @@ use chrono::{Datelike, NaiveDateTime, NaiveTime}; use polars_core::chunked_array::temporal::time_to_time64ns; use polars_core::prelude::*; use polars_core::series::IsSorted; +use polars_utils::format_pl_smallstr; use crate::prelude::*; @@ -13,7 +14,7 @@ pub fn in_nanoseconds_window(ndt: &NaiveDateTime) -> bool { /// Create a [`DatetimeChunked`] from a given `start` and `end` date and a given `interval`. pub fn date_range( - name: &str, + name: PlSmallStr, start: NaiveDateTime, end: NaiveDateTime, interval: Duration, @@ -40,7 +41,7 @@ pub fn date_range( #[doc(hidden)] pub fn datetime_range_impl( - name: &str, + name: PlSmallStr, start: i64, end: i64, interval: Duration, @@ -54,7 +55,7 @@ pub fn datetime_range_impl( ); let mut out = match tz { #[cfg(feature = "timezones")] - Some(tz) => out.into_datetime(tu, Some(tz.to_string())), + Some(tz) => out.into_datetime(tu, Some(format_pl_smallstr!("{}", tz))), _ => out.into_datetime(tu, None), }; @@ -64,7 +65,7 @@ pub fn datetime_range_impl( /// Create a [`TimeChunked`] from a given `start` and `end` date and a given `interval`. 
pub fn time_range( - name: &str, + name: PlSmallStr, start: NaiveTime, end: NaiveTime, interval: Duration, @@ -77,7 +78,7 @@ pub fn time_range( #[doc(hidden)] pub fn time_range_impl( - name: &str, + name: PlSmallStr, start: i64, end: i64, interval: Duration, diff --git a/crates/polars-time/src/group_by/dynamic.rs b/crates/polars-time/src/group_by/dynamic.rs index 659a02ab158c..bc44ae7437a1 100644 --- a/crates/polars-time/src/group_by/dynamic.rs +++ b/crates/polars-time/src/group_by/dynamic.rs @@ -7,10 +7,10 @@ use polars_core::utils::flatten::flatten_par; use polars_core::POOL; use polars_ops::series::SeriesMethods; use polars_utils::idx_vec::IdxVec; +use polars_utils::pl_str::PlSmallStr; use polars_utils::slice::{GetSaferUnchecked, SortedSlice}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use smartstring::alias::String as SmartString; use crate::prelude::*; @@ -21,7 +21,7 @@ struct Wrap(pub T); #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct DynamicGroupOptions { /// Time or index column. - pub index_column: SmartString, + pub index_column: PlSmallStr, /// Start a window at this interval. pub every: Duration, /// Window duration. @@ -55,7 +55,7 @@ impl Default for DynamicGroupOptions { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RollingGroupOptions { /// Time or index column. - pub index_column: SmartString, + pub index_column: PlSmallStr, /// Window duration. 
pub period: Duration, pub offset: Duration, @@ -225,7 +225,7 @@ impl Wrap<&DataFrame> { )?; let out = out.cast(&Int64).unwrap().cast(&Int32).unwrap(); for k in &mut keys { - if k.name() == UP_NAME || k.name() == LB_NAME { + if k.name().as_str() == UP_NAME || k.name().as_str() == LB_NAME { *k = k.cast(&Int64).unwrap().cast(&Int32).unwrap() } } @@ -243,7 +243,7 @@ impl Wrap<&DataFrame> { )?; let out = out.cast(&Int64).unwrap(); for k in &mut keys { - if k.name() == UP_NAME || k.name() == LB_NAME { + if k.name().as_str() == UP_NAME || k.name().as_str() == LB_NAME { *k = k.cast(&Int64).unwrap() } } @@ -476,21 +476,23 @@ impl Wrap<&DataFrame> { *key = unsafe { key.agg_first(&groups) }; } - let lower = lower_bound.map(|lower| Int64Chunked::new_vec(LB_NAME, lower)); - let upper = upper_bound.map(|upper| Int64Chunked::new_vec(UP_NAME, upper)); + let lower = + lower_bound.map(|lower| Int64Chunked::new_vec(PlSmallStr::from_static(LB_NAME), lower)); + let upper = + upper_bound.map(|upper| Int64Chunked::new_vec(PlSmallStr::from_static(UP_NAME), upper)); if options.label == Label::Left { let mut lower = lower.clone().unwrap(); if by.is_empty() { lower.set_sorted_flag(IsSorted::Ascending) } - dt = lower.with_name(dt.name()); + dt = lower.with_name(dt.name().clone()); } else if options.label == Label::Right { let mut upper = upper.clone().unwrap(); if by.is_empty() { upper.set_sorted_flag(IsSorted::Ascending) } - dt = upper.with_name(dt.name()); + dt = upper.with_name(dt.name().clone()); } if let (true, Some(mut lower), Some(mut upper)) = (options.include_boundaries, lower, upper) @@ -671,7 +673,7 @@ mod test { TimeUnit::Milliseconds, ] { let mut date = StringChunked::new( - "dt", + "dt".into(), [ "2020-01-01 13:45:48", "2020-01-01 16:42:13", @@ -691,7 +693,7 @@ mod test { )? 
.into_series(); date.set_sorted_flag(IsSorted::Ascending); - let a = Series::new("a", [3, 7, 5, 9, 2, 1]); + let a = Series::new("a".into(), [3, 7, 5, 9, 2, 1]); let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df @@ -707,7 +709,7 @@ mod test { .unwrap(); let sum = unsafe { a.agg_sum(&groups) }; - let expected = Series::new("", [3, 10, 15, 24, 11, 1]); + let expected = Series::new("".into(), [3, 10, 15, 24, 11, 1]); assert_eq!(sum, expected); } @@ -717,7 +719,7 @@ mod test { #[test] fn test_rolling_group_by_aggs() -> PolarsResult<()> { let mut date = StringChunked::new( - "dt", + "dt".into(), [ "2020-01-01 13:45:48", "2020-01-01 16:42:13", @@ -738,7 +740,7 @@ mod test { .into_series(); date.set_sorted_flag(IsSorted::Ascending); - let a = Series::new("a", [3, 7, 5, 9, 2, 1]); + let a = Series::new("a".into(), [3, 7, 5, 9, 2, 1]); let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df @@ -753,10 +755,13 @@ mod test { ) .unwrap(); - let nulls = Series::new("", [Some(3), Some(7), None, Some(9), Some(2), Some(1)]); + let nulls = Series::new( + "".into(), + [Some(3), Some(7), None, Some(9), Some(2), Some(1)], + ); let min = unsafe { a.agg_min(&groups) }; - let expected = Series::new("", [3, 3, 3, 3, 2, 1]); + let expected = Series::new("".into(), [3, 3, 3, 3, 2, 1]); assert_eq!(min, expected); // Expected for nulls is equality. 
@@ -764,7 +769,7 @@ mod test { assert_eq!(min, expected); let max = unsafe { a.agg_max(&groups) }; - let expected = Series::new("", [3, 7, 7, 9, 9, 1]); + let expected = Series::new("".into(), [3, 7, 7, 9, 9, 1]); assert_eq!(max, expected); let max = unsafe { nulls.agg_max(&groups) }; @@ -772,21 +777,21 @@ mod test { let var = unsafe { a.agg_var(&groups, 1) }; let expected = Series::new( - "", + "".into(), [0.0, 8.0, 4.000000000000002, 6.666666666666667, 24.5, 0.0], ); assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); let var = unsafe { nulls.agg_var(&groups, 1) }; - let expected = Series::new("", [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); + let expected = Series::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); let quantile = unsafe { a.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Series::new("", [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); + let expected = Series::new("".into(), [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); assert_eq!(quantile, expected); let quantile = unsafe { nulls.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Series::new("", [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); + let expected = Series::new("".into(), [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); assert_eq!(quantile, expected); Ok(()) @@ -807,7 +812,7 @@ mod test { .and_utc() .timestamp_millis(); let range = datetime_range_impl( - "date", + "date".into(), start, stop, Duration::parse("30m"), @@ -817,7 +822,7 @@ mod test { )? 
.into_series(); - let groups = Series::new("groups", ["a", "a", "a", "b", "b", "a", "a"]); + let groups = Series::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (time_key, mut keys, groups) = df @@ -861,7 +866,7 @@ mod test { .and_utc() .timestamp_millis(); let range = datetime_range_impl( - "_upper_boundary", + "_upper_boundary".into(), start, stop, Duration::parse("1h"), @@ -886,7 +891,7 @@ mod test { .and_utc() .timestamp_millis(); let range = datetime_range_impl( - "_lower_boundary", + "_lower_boundary".into(), start, stop, Duration::parse("1h"), @@ -927,7 +932,7 @@ mod test { .and_utc() .timestamp_millis(); let range = datetime_range_impl( - "date", + "date".into(), start, stop, Duration::parse("1d"), @@ -937,7 +942,7 @@ mod test { )? .into_series(); - let groups = Series::new("groups", ["a", "a", "a", "b", "b", "a", "a"]); + let groups = Series::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (mut time_key, keys, _groups) = df @@ -955,8 +960,8 @@ mod test { }, ) .unwrap(); - time_key.rename(""); - let lower_bound = keys[1].clone().with_name(""); + time_key.rename("".into()); + let lower_bound = keys[1].clone().with_name("".into()); assert!(time_key.equals(&lower_bound)); Ok(()) } diff --git a/crates/polars-time/src/round.rs b/crates/polars-time/src/round.rs index 7fd48a407f51..f67c509c5dc2 100644 --- a/crates/polars-time/src/round.rs +++ b/crates/polars-time/src/round.rs @@ -59,7 +59,7 @@ impl PolarsRound for DatetimeChunked { return Ok(out?.into_datetime(self.time_unit(), self.time_zone().clone())); } } else { - return Ok(Int64Chunked::full_null(self.name(), self.len()) + return Ok(Int64Chunked::full_null(self.name().clone(), self.len()) .into_datetime(self.time_unit(), self.time_zone().clone())); } } @@ -112,7 +112,7 @@ impl PolarsRound for DateChunked { ) }) } else { - 
Ok(Int32Chunked::full_null(self.name(), self.len())) + Ok(Int32Chunked::full_null(self.name().clone(), self.len())) } }, _ => broadcast_try_binary_elementwise(self, every, |opt_t, opt_every| { diff --git a/crates/polars-time/src/truncate.rs b/crates/polars-time/src/truncate.rs index d3c74420252f..0548911e0fbf 100644 --- a/crates/polars-time/src/truncate.rs +++ b/crates/polars-time/src/truncate.rs @@ -59,7 +59,7 @@ impl PolarsTruncate for DatetimeChunked { return Ok(out?.into_datetime(self.time_unit(), self.time_zone().clone())); } } else { - return Ok(Int64Chunked::full_null(self.name(), self.len()) + return Ok(Int64Chunked::full_null(self.name().clone(), self.len()) .into_datetime(self.time_unit(), self.time_zone().clone())); } } @@ -110,7 +110,7 @@ impl PolarsTruncate for DateChunked { / MILLISECONDS_IN_DAY) as i32) }) } else { - Ok(Int32Chunked::full_null(self.name(), self.len())) + Ok(Int32Chunked::full_null(self.name().clone(), self.len())) } }, _ => broadcast_try_binary_elementwise(self, every, |opt_t, opt_every| { diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index 235ec383fbc8..27ed3e7f127a 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -38,7 +38,7 @@ pub trait PolarsUpsample { /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter", /// and "calendar year". - fn upsample>( + fn upsample>( &self, by: I, time_column: &str, @@ -79,7 +79,7 @@ pub trait PolarsUpsample { /// day (which may not be 24 hours, depending on daylight savings). /// Similarly for "calendar week", "calendar month", "calendar quarter", /// and "calendar year". 
- fn upsample_stable>( + fn upsample_stable>( &self, by: I, time_column: &str, @@ -88,7 +88,7 @@ pub trait PolarsUpsample { } impl PolarsUpsample for DataFrame { - fn upsample>( + fn upsample>( &self, by: I, time_column: &str, @@ -100,7 +100,7 @@ impl PolarsUpsample for DataFrame { upsample_impl(self, by, time_column, every, false) } - fn upsample_stable>( + fn upsample_stable>( &self, by: I, time_column: &str, @@ -115,7 +115,7 @@ impl PolarsUpsample for DataFrame { fn upsample_impl( source: &DataFrame, - by: Vec, + by: Vec, index_column: &str, every: Duration, stable: bool, @@ -201,7 +201,7 @@ fn upsample_single_impl( _ => None, }; let range = datetime_range_impl( - index_col_name, + index_col_name.clone(), first, last, every, @@ -213,8 +213,8 @@ fn upsample_single_impl( .into_frame(); range.join( source, - &[index_col_name], - &[index_col_name], + [index_col_name], + [index_col_name], JoinArgs::new(JoinType::Left), ) }, diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml index 6e2ac16c6e85..d2041e33f7be 100644 --- a/crates/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -23,7 +23,6 @@ once_cell = { workspace = true } raw-cpuid = { workspace = true } rayon = { workspace = true } serde = { workspace = true, optional = true } -smartstring = { workspace = true } stacker = { workspace = true } sysinfo = { version = "0.31", default-features = false, features = ["system"], optional = true } @@ -38,3 +37,4 @@ mmap = ["memmap"] bigidx = [] nightly = [] ir_serde = ["serde"] +serde = ["dep:serde", "serde/derive"] diff --git a/crates/polars-utils/src/fmt.rs b/crates/polars-utils/src/fmt.rs index dc34490858c0..797c1a45d020 100644 --- a/crates/polars-utils/src/fmt.rs +++ b/crates/polars-utils/src/fmt.rs @@ -1,15 +1,3 @@ -#[macro_export] -macro_rules! 
format_smartstring { - ($($arg:tt)*) => {{ - use smartstring::alias::String as SmartString; - use std::fmt::Write; - - let mut string = SmartString::new(); - write!(string, $($arg)*).unwrap(); - string - }} -} - #[macro_export] macro_rules! format_list_container { ($e:expr, $start:tt, $end:tt) => {{ diff --git a/crates/polars-utils/src/lib.rs b/crates/polars-utils/src/lib.rs index 0e7a1b4c6858..68e331973800 100644 --- a/crates/polars-utils/src/lib.rs +++ b/crates/polars-utils/src/lib.rs @@ -20,6 +20,7 @@ pub mod hashing; pub mod idx_vec; pub mod mem; pub mod min_max; +pub mod pl_str; pub mod priority; pub mod slice; pub mod sort; diff --git a/crates/polars-utils/src/pl_str.rs b/crates/polars-utils/src/pl_str.rs new file mode 100644 index 000000000000..b1d7277c423d --- /dev/null +++ b/crates/polars-utils/src/pl_str.rs @@ -0,0 +1,182 @@ +use std::sync::Arc; + +use once_cell::sync::Lazy; + +#[macro_export] +macro_rules! format_pl_smallstr { + ($($arg:tt)*) => {{ + use std::fmt::Write; + + let mut string = String::new(); + write!(string, $($arg)*).unwrap(); + PlSmallStr::from_string(string) + }} +} + +/// String type that interns small strings and has O(1) clone. 
+#[derive(Clone, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct PlSmallStr(Arc); + +impl PlSmallStr { + /// Initialize an empty string "" + /// TODO: make this a `const fn` + #[inline(always)] + pub fn const_default() -> Self { + Self::empty_static().clone() + } + + /// This is a workaround until `const_default` becomes a const fn + #[inline(always)] + pub fn empty_static() -> &'static Self { + static THIS: Lazy = Lazy::new(|| PlSmallStr::from_static("")); + &THIS + } + + /// TODO: make this a `const fn` + #[inline(always)] + pub fn from_static(s: &'static str) -> Self { + Self(Arc::from(s)) + } + + #[inline(always)] + #[allow(clippy::should_implement_trait)] + pub fn from_str(s: &str) -> Self { + Self(Arc::from(s)) + } + + #[inline(always)] + pub fn from_string(s: String) -> Self { + Self(Arc::from(s)) + } + + #[inline(always)] + pub fn as_str(&self) -> &str { + self.0.as_ref() + } + + #[inline(always)] + pub fn into_string(self) -> String { + self.0.to_string() + } +} + +impl Default for PlSmallStr { + #[inline(always)] + fn default() -> Self { + Self::const_default() + } +} + +impl AsRef for PlSmallStr { + #[inline(always)] + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + +impl PartialEq for PlSmallStr +where + T: AsRef + ?Sized, +{ + fn eq(&self, other: &T) -> bool { + self.as_str() == other.as_ref() + } +} + +impl PartialEq for &str { + fn eq(&self, other: &PlSmallStr) -> bool { + *self == other.as_str() + } +} + +impl PartialEq for &&str { + fn eq(&self, other: &PlSmallStr) -> bool { + **self == other.as_str() + } +} + +impl PartialEq for String { + fn eq(&self, other: &PlSmallStr) -> bool { + self.as_str() == other.as_str() + } +} + +impl PartialEq for &String { + fn eq(&self, other: &PlSmallStr) -> bool { + self.as_str() == other.as_str() + } +} + +impl core::fmt::Debug for PlSmallStr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + 
} +} + +impl core::ops::Deref for PlSmallStr { + type Target = str; + + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl core::borrow::Borrow for PlSmallStr { + #[inline(always)] + fn borrow(&self) -> &str { + self.0.as_ref() + } +} + +/// We implement `From<&PlSmallStr>` to support `&[S] where S: Into`, +/// prefer calling `.clone()` for existing `PlSmallStr`s where possible. +impl From<&PlSmallStr> for PlSmallStr { + #[inline(always)] + fn from(value: &PlSmallStr) -> Self { + value.clone() + } +} + +impl From<&&PlSmallStr> for PlSmallStr { + #[inline(always)] + fn from(value: &&PlSmallStr) -> Self { + (*value).clone() + } +} + +impl From<&str> for PlSmallStr { + #[inline(always)] + fn from(value: &str) -> Self { + Self::from_str(value) + } +} + +impl From<&&str> for PlSmallStr { + #[inline(always)] + fn from(value: &&str) -> Self { + Self::from_str(value) + } +} + +impl From for PlSmallStr { + #[inline(always)] + fn from(value: String) -> Self { + Self::from_string(value) + } +} + +impl From<&String> for PlSmallStr { + #[inline(always)] + fn from(value: &String) -> Self { + Self::from_str(value.as_str()) + } +} + +impl core::fmt::Display for PlSmallStr { + #[inline(always)] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 196f1832ada4..6742e090bc39 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -65,13 +65,14 @@ default = [ ] ndarray = ["polars-core/ndarray"] # serde support for dataframes and series -serde = ["polars-core/serde"] +serde = ["polars-core/serde", "polars-utils/serde"] serde-lazy = [ "polars-core/serde-lazy", "polars-lazy?/serde", "polars-time?/serde", "polars-io?/serde", "polars-ops?/serde", + "polars-utils/serde", ] parquet = ["polars-io", "polars-lazy?/parquet", "polars-io/parquet", "polars-sql?/parquet"] async = ["polars-lazy?/async"] diff --git 
a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 8faf05c0a96c..95c759f836e7 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -48,10 +48,10 @@ //! let ca: UInt32Chunked = (0..10).map(Some).collect(); //! //! // from slices -//! let ca = UInt32Chunked::new("foo", &[1, 2, 3]); +//! let ca = UInt32Chunked::new("foo".into(), &[1, 2, 3]); //! //! // use builders -//! let mut builder = PrimitiveChunkedBuilder::::new("foo", 10); +//! let mut builder = PrimitiveChunkedBuilder::::new("foo".into(), 10); //! for value in 0..10 { //! builder.append_value(value); //! } @@ -67,10 +67,10 @@ //! let s: Series = (0..10).map(Some).collect(); //! //! // from slices -//! let s = Series::new("foo", &[1, 2, 3]); +//! let s = Series::new("foo".into(), &[1, 2, 3]); //! //! // from a chunked-array -//! let ca = UInt32Chunked::new("foo", &[Some(1), None, Some(3)]); +//! let ca = UInt32Chunked::new("foo".into(), &[Some(1), None, Some(3)]); //! let s = ca.into_series(); //! ``` //! @@ -89,8 +89,8 @@ //! ]?; //! //! // from a Vec -//! let s1 = Series::new("names", &["a", "b", "c"]); -//! let s2 = Series::new("values", &[Some(1), None, Some(3)]); +//! let s1 = Series::new("names".into(), &["a", "b", "c"]); +//! let s2 = Series::new("values".into(), &[Some(1), None, Some(3)]); //! let df = DataFrame::new(vec![s1, s2])?; //! # Ok(()) //! # } @@ -103,8 +103,8 @@ //! ``` //! use polars::prelude::*; //! # fn example() -> PolarsResult<()> { -//! let s_int = Series::new("a", &[1, 2, 3]); -//! let s_flt = Series::new("b", &[1.0, 2.0, 3.0]); +//! let s_int = Series::new("a".into(), &[1, 2, 3]); +//! let s_flt = Series::new("b".into(), &[1.0, 2.0, 3.0]); //! //! let added = &s_int + &s_flt; //! let subtracted = &s_int - &s_flt; @@ -125,7 +125,7 @@ //! let multiplied = s_flt * 2.0; //! //! // or broadcast Series to match the operands type -//! let added = &s_int * &Series::new("broadcast_me", &[10]); +//! 
let added = &s_int * &Series::new("broadcast_me".into(), &[10]); //! //! # Ok(()) //! # } @@ -136,7 +136,7 @@ //! //! ```rust //! # use polars::prelude::*; -//! let series = Series::new("foo", [1, 2, 3]); +//! let series = Series::new("foo".into(), [1, 2, 3]); //! //! // 1 / s //! let divide_one_by_s = 1.div(&series); @@ -151,7 +151,7 @@ //! //! ```rust //! # use polars::prelude::*; -//! let ca = UInt32Chunked::new("foo", &[1, 2, 3]); +//! let ca = UInt32Chunked::new("foo".into(), &[1, 2, 3]); //! //! // 1 / ca //! let divide_one_by_ca = ca.apply_values(|rhs| 1 / rhs); @@ -165,8 +165,8 @@ //! use polars::prelude::*; //! # fn example() -> PolarsResult<()> { //! -//! let s = Series::new("a", &[1, 2, 3]); -//! let ca = UInt32Chunked::new("b", &[Some(3), None, Some(1)]); +//! let s = Series::new("a".into(), &[1, 2, 3]); +//! let ca = UInt32Chunked::new("b".into(), &[Some(3), None, Some(1)]); //! //! // compare Series with numeric values //! // == @@ -251,11 +251,11 @@ //! # fn example() -> PolarsResult<()> { //! //! // apply a closure over all values -//! let s = Series::new("foo", &[Some(1), Some(2), None]); +//! let s = Series::new("foo".into(), &[Some(1), Some(2), None]); //! s.i32()?.apply_values(|value| value * 20); //! //! // count string lengths -//! let s = Series::new("foo", &["foo", "bar", "foobar"]); +//! let s = Series::new("foo".into(), &["foo", "bar", "foobar"]); //! unary_elementwise_values(s.str()?, |str_val| str_val.len() as u64); //! //! # Ok(()) @@ -506,15 +506,15 @@ //! use polars::df; //! //! # fn example(df: &DataFrame) -> PolarsResult<()> { -//! let s0 = Series::new("a", &[1i64, 2, 3]); -//! let s1 = Series::new("b", &[1i64, 1, 1]); -//! let s2 = Series::new("c", &[2i64, 2, 2]); +//! let s0 = Series::new("a".into(), &[1i64, 2, 3]); +//! let s1 = Series::new("b".into(), &[1i64, 1, 1]); +//! let s2 = Series::new("c".into(), &[2i64, 2, 2]); //! // construct a new ListChunked for a slice of Series. //! 
let list = Series::new("foo", &[s0, s1, s2]); //! //! // construct a few more Series. -//! let s0 = Series::new("B", [1, 2, 3]); -//! let s1 = Series::new("C", [1, 1, 1]); +//! let s0 = Series::new("B".into(), [1, 2, 3]); +//! let s1 = Series::new("C".into(), [1, 1, 1]); //! let df = DataFrame::new(vec![list, s0, s1])?; //! //! let exploded = df.explode(["foo"])?; diff --git a/crates/polars/tests/it/arrow/array/fixed_size_binary/mod.rs b/crates/polars/tests/it/arrow/array/fixed_size_binary/mod.rs index 12019be64205..5e5a6a37b2c9 100644 --- a/crates/polars/tests/it/arrow/array/fixed_size_binary/mod.rs +++ b/crates/polars/tests/it/arrow/array/fixed_size_binary/mod.rs @@ -95,7 +95,7 @@ fn to() { let a = FixedSizeBinaryArray::new(ArrowDataType::FixedSizeBinary(2), values, None); let extension = ArrowDataType::Extension( - "a".to_string(), + "a".into(), Box::new(ArrowDataType::FixedSizeBinary(2)), None, ); diff --git a/crates/polars/tests/it/arrow/array/fixed_size_list/mod.rs b/crates/polars/tests/it/arrow/array/fixed_size_list/mod.rs index 5b8ea7b8c950..924a12c87168 100644 --- a/crates/polars/tests/it/arrow/array/fixed_size_list/mod.rs +++ b/crates/polars/tests/it/arrow/array/fixed_size_list/mod.rs @@ -9,7 +9,7 @@ fn data() -> FixedSizeListArray { FixedSizeListArray::try_new( ArrowDataType::FixedSizeList( - Box::new(Field::new("a", values.data_type().clone(), true)), + Box::new(Field::new("a".into(), values.data_type().clone(), true)), 2, ), values.boxed(), @@ -59,7 +59,7 @@ fn debug() { #[test] fn empty() { let array = FixedSizeListArray::new_empty(ArrowDataType::FixedSizeList( - Box::new(Field::new("a", ArrowDataType::Int32, true)), + Box::new(Field::new("a".into(), ArrowDataType::Int32, true)), 2, )); assert_eq!(array.values().len(), 0); @@ -69,7 +69,10 @@ fn empty() { #[test] fn null() { let array = FixedSizeListArray::new_null( - ArrowDataType::FixedSizeList(Box::new(Field::new("a", ArrowDataType::Int32, true)), 2), + ArrowDataType::FixedSizeList( + 
Box::new(Field::new("a".into(), ArrowDataType::Int32, true)), + 2, + ), 2, ); assert_eq!(array.values().len(), 4); @@ -80,7 +83,10 @@ fn null() { fn wrong_size() { let values = Int32Array::from_slice([10, 20, 0]); assert!(FixedSizeListArray::try_new( - ArrowDataType::FixedSizeList(Box::new(Field::new("a", ArrowDataType::Int32, true)), 2), + ArrowDataType::FixedSizeList( + Box::new(Field::new("a".into(), ArrowDataType::Int32, true)), + 2 + ), values.boxed(), None ) @@ -91,7 +97,10 @@ fn wrong_size() { fn wrong_len() { let values = Int32Array::from_slice([10, 20, 0]); assert!(FixedSizeListArray::try_new( - ArrowDataType::FixedSizeList(Box::new(Field::new("a", ArrowDataType::Int32, true)), 2), + ArrowDataType::FixedSizeList( + Box::new(Field::new("a".into(), ArrowDataType::Int32, true)), + 2 + ), values.boxed(), Some([true, false, false].into()), // it should be 2 ) diff --git a/crates/polars/tests/it/arrow/array/fixed_size_list/mutable.rs b/crates/polars/tests/it/arrow/array/fixed_size_list/mutable.rs index 23ea53231059..4020984643de 100644 --- a/crates/polars/tests/it/arrow/array/fixed_size_list/mutable.rs +++ b/crates/polars/tests/it/arrow/array/fixed_size_list/mutable.rs @@ -36,7 +36,7 @@ fn new_with_field() { let mut list = MutableFixedSizeListArray::new_with_field( MutablePrimitiveArray::::new(), - "custom_items", + "custom_items".into(), false, 3, ); @@ -46,7 +46,11 @@ fn new_with_field() { assert_eq!( list.data_type(), &ArrowDataType::FixedSizeList( - Box::new(Field::new("custom_items", ArrowDataType::Int32, false)), + Box::new(Field::new( + "custom_items".into(), + ArrowDataType::Int32, + false + )), 3 ) ); diff --git a/crates/polars/tests/it/arrow/array/growable/list.rs b/crates/polars/tests/it/arrow/array/growable/list.rs index 1bc0985ceb4f..c9781f3adf5f 100644 --- a/crates/polars/tests/it/arrow/array/growable/list.rs +++ b/crates/polars/tests/it/arrow/array/growable/list.rs @@ -19,7 +19,7 @@ fn extension() { let array = create_list_array(data); let 
data_type = - ArrowDataType::Extension("ext".to_owned(), Box::new(array.data_type().clone()), None); + ArrowDataType::Extension("ext".into(), Box::new(array.data_type().clone()), None); let array_ext = ListArray::new( data_type, array.offsets().clone(), diff --git a/crates/polars/tests/it/arrow/array/growable/mod.rs b/crates/polars/tests/it/arrow/array/growable/mod.rs index 63299b445eb8..32a557a05141 100644 --- a/crates/polars/tests/it/arrow/array/growable/mod.rs +++ b/crates/polars/tests/it/arrow/array/growable/mod.rs @@ -44,16 +44,15 @@ fn test_make_growable_extension() { .unwrap(); make_growable(&[&array], false, 2); - let data_type = - ArrowDataType::Extension("ext".to_owned(), Box::new(ArrowDataType::Int32), None); + let data_type = ArrowDataType::Extension("ext".into(), Box::new(ArrowDataType::Int32), None); let array = Int32Array::from_slice([1, 2]).to(data_type.clone()); let array_grown = make_growable(&[&array], false, 2).as_box(); assert_eq!(array_grown.data_type(), &data_type); let data_type = ArrowDataType::Extension( - "ext".to_owned(), + "ext".into(), Box::new(ArrowDataType::Struct(vec![Field::new( - "a", + "a".into(), ArrowDataType::Int32, false, )])), diff --git a/crates/polars/tests/it/arrow/array/growable/struct_.rs b/crates/polars/tests/it/arrow/array/growable/struct_.rs index e410522b92bf..07f0403ee294 100644 --- a/crates/polars/tests/it/arrow/array/growable/struct_.rs +++ b/crates/polars/tests/it/arrow/array/growable/struct_.rs @@ -19,8 +19,8 @@ fn some_values() -> (ArrowDataType, Vec>) { Some(5), ])); let fields = vec![ - Field::new("f1", ArrowDataType::Utf8View, true), - Field::new("f2", ArrowDataType::Int32, true), + Field::new("f1".into(), ArrowDataType::Utf8View, true), + Field::new("f2".into(), ArrowDataType::Int32, true), ]; (ArrowDataType::Struct(fields), vec![strings, ints]) } diff --git a/crates/polars/tests/it/arrow/array/map/mod.rs b/crates/polars/tests/it/arrow/array/map/mod.rs index a9e0fa62b317..c4aab8c2a8cb 100644 --- 
a/crates/polars/tests/it/arrow/array/map/mod.rs +++ b/crates/polars/tests/it/arrow/array/map/mod.rs @@ -3,13 +3,13 @@ use arrow::datatypes::{ArrowDataType, Field}; fn dt() -> ArrowDataType { ArrowDataType::Struct(vec![ - Field::new("a", ArrowDataType::Utf8, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Utf8, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]) } fn array() -> MapArray { - let data_type = ArrowDataType::Map(Box::new(Field::new("a", dt(), true)), false); + let data_type = ArrowDataType::Map(Box::new(Field::new("a".into(), dt(), true)), false); let field = StructArray::new( dt(), diff --git a/crates/polars/tests/it/arrow/array/mod.rs b/crates/polars/tests/it/arrow/array/mod.rs index 2dcb32ea6708..91c3759cf3a5 100644 --- a/crates/polars/tests/it/arrow/array/mod.rs +++ b/crates/polars/tests/it/arrow/array/mod.rs @@ -24,7 +24,11 @@ fn nulls() { ArrowDataType::Float64, ArrowDataType::Utf8, ArrowDataType::Binary, - ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Binary, true))), + ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Binary, + true, + ))), ]; let a = datatypes .into_iter() @@ -34,12 +38,12 @@ fn nulls() { // unions' null count is always 0 let datatypes = vec![ ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Dense, ), ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Sparse, ), @@ -57,23 +61,27 @@ fn empty() { ArrowDataType::Float64, ArrowDataType::Utf8, ArrowDataType::Binary, - ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Binary, true))), ArrowDataType::List(Box::new(Field::new( - "a", - ArrowDataType::Extension("ext".to_owned(), Box::new(ArrowDataType::Int32), None), + "a".into(), + ArrowDataType::Binary, + true, + ))), + 
ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Extension("ext".into(), Box::new(ArrowDataType::Int32), None), true, ))), ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Sparse, ), ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Dense, ), - ArrowDataType::Struct(vec![Field::new("a", ArrowDataType::Int32, true)]), + ArrowDataType::Struct(vec![Field::new("a".into(), ArrowDataType::Int32, true)]), ]; let a = datatypes.into_iter().all(|x| new_empty_array(x).len() == 0); assert!(a); @@ -86,22 +94,26 @@ fn empty_extension() { ArrowDataType::Float64, ArrowDataType::Utf8, ArrowDataType::Binary, - ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Binary, true))), + ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Binary, + true, + ))), ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Sparse, ), ArrowDataType::Union( - vec![Field::new("a", ArrowDataType::Binary, true)], + vec![Field::new("a".into(), ArrowDataType::Binary, true)], None, UnionMode::Dense, ), - ArrowDataType::Struct(vec![Field::new("a", ArrowDataType::Int32, true)]), + ArrowDataType::Struct(vec![Field::new("a".into(), ArrowDataType::Int32, true)]), ]; let a = datatypes .into_iter() - .map(|dt| ArrowDataType::Extension("ext".to_owned(), Box::new(dt), None)) + .map(|dt| ArrowDataType::Extension("ext".into(), Box::new(dt), None)) .all(|x| { let a = new_empty_array(x); a.len() == 0 && matches!(a.data_type(), ArrowDataType::Extension(_, _, _)) @@ -116,7 +128,11 @@ fn test_clone() { ArrowDataType::Float64, ArrowDataType::Utf8, ArrowDataType::Binary, - ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Binary, true))), + 
ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Binary, + true, + ))), ]; let a = datatypes .into_iter() diff --git a/crates/polars/tests/it/arrow/array/primitive/fmt.rs b/crates/polars/tests/it/arrow/array/primitive/fmt.rs index e670bc93fe7b..eb6c067b9ec2 100644 --- a/crates/polars/tests/it/arrow/array/primitive/fmt.rs +++ b/crates/polars/tests/it/arrow/array/primitive/fmt.rs @@ -117,7 +117,7 @@ fn debug_timestamp_ns() { fn debug_timestamp_tz_ns() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(ArrowDataType::Timestamp( TimeUnit::Nanosecond, - Some("+02:00".to_string()), + Some("+02:00".into()), )); assert_eq!( format!("{array:?}"), @@ -129,7 +129,7 @@ fn debug_timestamp_tz_ns() { fn debug_timestamp_tz_not_parsable() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(ArrowDataType::Timestamp( TimeUnit::Nanosecond, - Some("aa".to_string()), + Some("aa".into()), )); assert_eq!( format!("{array:?}"), @@ -142,7 +142,7 @@ fn debug_timestamp_tz_not_parsable() { fn debug_timestamp_tz1_ns() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(ArrowDataType::Timestamp( TimeUnit::Nanosecond, - Some("Europe/Lisbon".to_string()), + Some("Europe/Lisbon".into()), )); assert_eq!( format!("{array:?}"), diff --git a/crates/polars/tests/it/arrow/array/struct_/iterator.rs b/crates/polars/tests/it/arrow/array/struct_/iterator.rs index 5b4b0b784d13..e4b6a7691ad0 100644 --- a/crates/polars/tests/it/arrow/array/struct_/iterator.rs +++ b/crates/polars/tests/it/arrow/array/struct_/iterator.rs @@ -8,8 +8,8 @@ fn test_simple_iter() { let int = Int32Array::from_slice([42, 28, 19, 31]).boxed(); let fields = vec![ - Field::new("b", ArrowDataType::Boolean, false), - Field::new("c", ArrowDataType::Int32, false), + Field::new("b".into(), ArrowDataType::Boolean, false), + Field::new("c".into(), ArrowDataType::Int32, false), ]; let array = StructArray::new( diff --git a/crates/polars/tests/it/arrow/array/struct_/mod.rs 
b/crates/polars/tests/it/arrow/array/struct_/mod.rs index 5af6556096bc..bd1a1c83086c 100644 --- a/crates/polars/tests/it/arrow/array/struct_/mod.rs +++ b/crates/polars/tests/it/arrow/array/struct_/mod.rs @@ -10,8 +10,8 @@ fn array() -> StructArray { let int = Int32Array::from_slice([42, 28, 19, 31]).boxed(); let fields = vec![ - Field::new("b", ArrowDataType::Boolean, false), - Field::new("c", ArrowDataType::Int32, false), + Field::new("b".into(), ArrowDataType::Boolean, false), + Field::new("c".into(), ArrowDataType::Int32, false), ]; StructArray::new( diff --git a/crates/polars/tests/it/arrow/array/struct_/mutable.rs b/crates/polars/tests/it/arrow/array/struct_/mutable.rs index e9d698aa1bb3..32c8b567b8e6 100644 --- a/crates/polars/tests/it/arrow/array/struct_/mutable.rs +++ b/crates/polars/tests/it/arrow/array/struct_/mutable.rs @@ -5,7 +5,8 @@ use arrow::datatypes::{ArrowDataType, Field}; fn push() { let c1 = Box::new(MutablePrimitiveArray::::new()) as Box; let values = vec![c1]; - let data_type = ArrowDataType::Struct(vec![Field::new("f1", ArrowDataType::Int32, true)]); + let data_type = + ArrowDataType::Struct(vec![Field::new("f1".into(), ArrowDataType::Int32, true)]); let mut a = MutableStructArray::new(data_type, values); a.value::>(0) diff --git a/crates/polars/tests/it/arrow/array/union.rs b/crates/polars/tests/it/arrow/array/union.rs index b358aa8e44bb..ef9b24cb10cf 100644 --- a/crates/polars/tests/it/arrow/array/union.rs +++ b/crates/polars/tests/it/arrow/array/union.rs @@ -20,8 +20,8 @@ where #[test] fn sparse_debug() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let types = vec![0, 0, 1].into(); @@ -40,8 +40,8 @@ fn sparse_debug() -> PolarsResult<()> { #[test] fn dense_debug() -> 
PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Dense); let types = vec![0, 0, 1].into(); @@ -61,8 +61,8 @@ fn dense_debug() -> PolarsResult<()> { #[test] fn slice() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::LargeUtf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::LargeUtf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); @@ -89,8 +89,8 @@ fn slice() -> PolarsResult<()> { #[test] fn iter_sparse() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); @@ -122,8 +122,8 @@ fn iter_sparse() -> PolarsResult<()> { #[test] fn iter_dense() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); @@ -156,8 +156,8 @@ fn iter_dense() -> PolarsResult<()> { #[test] fn iter_sparse_slice() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let 
data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); @@ -182,8 +182,8 @@ fn iter_sparse_slice() -> PolarsResult<()> { #[test] fn iter_dense_slice() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); @@ -209,8 +209,8 @@ fn iter_dense_slice() -> PolarsResult<()> { #[test] fn scalar() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); @@ -266,8 +266,8 @@ fn scalar() -> PolarsResult<()> { #[test] fn dense_without_offsets_is_error() { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Dense); let types = vec![0, 0, 1].into(); @@ -282,8 +282,8 @@ fn dense_without_offsets_is_error() { #[test] fn fields_must_match() { let fields = vec![ - Field::new("a", ArrowDataType::Int64, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int64, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let types = vec![0, 0, 1].into(); @@ -298,8 +298,8 @@ fn fields_must_match() { #[test] fn sparse_with_offsets_is_error() { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - 
Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let fields = vec![ @@ -316,8 +316,8 @@ fn sparse_with_offsets_is_error() { #[test] fn offsets_must_be_in_bounds() { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let fields = vec![ @@ -335,8 +335,8 @@ fn offsets_must_be_in_bounds() { #[test] fn sparse_with_wrong_offsets1_is_error() { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let fields = vec![ @@ -354,8 +354,8 @@ fn sparse_with_wrong_offsets1_is_error() { #[test] fn types_must_be_in_bounds() -> PolarsResult<()> { let fields = vec![ - Field::new("a", ArrowDataType::Int32, true), - Field::new("b", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int32, true), + Field::new("b".into(), ArrowDataType::Utf8, true), ]; let data_type = ArrowDataType::Union(fields, None, UnionMode::Sparse); let fields = vec![ diff --git a/crates/polars/tests/it/arrow/compute/aggregate/memory.rs b/crates/polars/tests/it/arrow/compute/aggregate/memory.rs index 3f31240b8602..1850f4e6bb2d 100644 --- a/crates/polars/tests/it/arrow/compute/aggregate/memory.rs +++ b/crates/polars/tests/it/arrow/compute/aggregate/memory.rs @@ -23,7 +23,7 @@ fn utf8() { #[test] fn fixed_size_list() { let data_type = ArrowDataType::FixedSizeList( - Box::new(Field::new("elem", ArrowDataType::Float32, false)), + 
Box::new(Field::new("elem".into(), ArrowDataType::Float32, false)), 3, ); let values = Box::new(Float32Array::from_slice([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])); diff --git a/crates/polars/tests/it/arrow/ffi/data.rs b/crates/polars/tests/it/arrow/ffi/data.rs index bb798a1bc4fc..9dc194acc471 100644 --- a/crates/polars/tests/it/arrow/ffi/data.rs +++ b/crates/polars/tests/it/arrow/ffi/data.rs @@ -4,7 +4,7 @@ use arrow::ffi; use polars_error::PolarsResult; fn _test_round_trip(array: Box, expected: Box) -> PolarsResult<()> { - let field = Field::new("a", array.data_type().clone(), true); + let field = Field::new("a".into(), array.data_type().clone(), true); // export array and corresponding data_type let array_ffi = ffi::export_array_to_c(array); diff --git a/crates/polars/tests/it/arrow/ffi/stream.rs b/crates/polars/tests/it/arrow/ffi/stream.rs index f949fdf4c88e..82f66bc26c15 100644 --- a/crates/polars/tests/it/arrow/ffi/stream.rs +++ b/crates/polars/tests/it/arrow/ffi/stream.rs @@ -4,7 +4,7 @@ use arrow::ffi; use polars_error::{PolarsError, PolarsResult}; fn _test_round_trip(arrays: Vec>) -> PolarsResult<()> { - let field = Field::new("a", arrays[0].data_type().clone(), true); + let field = Field::new("a".into(), arrays[0].data_type().clone(), true); let iter = Box::new(arrays.clone().into_iter().map(Ok)) as _; let mut stream = Box::new(ffi::ArrowArrayStream::empty()); diff --git a/crates/polars/tests/it/arrow/io/ipc/mod.rs b/crates/polars/tests/it/arrow/io/ipc/mod.rs index c5b622305d42..3575b2c642a6 100644 --- a/crates/polars/tests/it/arrow/io/ipc/mod.rs +++ b/crates/polars/tests/it/arrow/io/ipc/mod.rs @@ -49,7 +49,7 @@ fn round_trip( } fn prep_schema(array: &dyn Array) -> ArrowSchemaRef { - let fields = vec![Field::new("a", array.data_type().clone(), true)]; + let fields = vec![Field::new("a".into(), array.data_type().clone(), true)]; Arc::new(ArrowSchema::from(fields)) } diff --git a/crates/polars/tests/it/arrow/scalar/fixed_size_list.rs 
b/crates/polars/tests/it/arrow/scalar/fixed_size_list.rs index 2aa6f45bbd74..79c0fead92a5 100644 --- a/crates/polars/tests/it/arrow/scalar/fixed_size_list.rs +++ b/crates/polars/tests/it/arrow/scalar/fixed_size_list.rs @@ -5,8 +5,10 @@ use arrow::scalar::{FixedSizeListScalar, Scalar}; #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = - ArrowDataType::FixedSizeList(Box::new(Field::new("a", ArrowDataType::Boolean, true)), 2); + let dt = ArrowDataType::FixedSizeList( + Box::new(Field::new("a".into(), ArrowDataType::Boolean, true)), + 2, + ); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -25,8 +27,10 @@ fn equal() { #[test] fn basics() { - let dt = - ArrowDataType::FixedSizeList(Box::new(Field::new("a", ArrowDataType::Boolean, true)), 2); + let dt = ArrowDataType::FixedSizeList( + Box::new(Field::new("a".into(), ArrowDataType::Boolean, true)), + 2, + ); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), diff --git a/crates/polars/tests/it/arrow/scalar/list.rs b/crates/polars/tests/it/arrow/scalar/list.rs index 7cd2938237c9..653321bdf187 100644 --- a/crates/polars/tests/it/arrow/scalar/list.rs +++ b/crates/polars/tests/it/arrow/scalar/list.rs @@ -5,7 +5,11 @@ use arrow::scalar::{ListScalar, Scalar}; #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Boolean, true))); + let dt = ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Boolean, + true, + ))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -21,7 +25,11 @@ fn equal() { #[test] fn basics() { - let dt = ArrowDataType::List(Box::new(Field::new("a", ArrowDataType::Boolean, true))); + let dt = ArrowDataType::List(Box::new(Field::new( + "a".into(), + ArrowDataType::Boolean, + true, + ))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, 
false]).boxed()), diff --git a/crates/polars/tests/it/arrow/scalar/map.rs b/crates/polars/tests/it/arrow/scalar/map.rs index e9f0ede0784f..1e6328eacd82 100644 --- a/crates/polars/tests/it/arrow/scalar/map.rs +++ b/crates/polars/tests/it/arrow/scalar/map.rs @@ -6,8 +6,8 @@ use arrow::scalar::{MapScalar, Scalar}; #[test] fn equal() { let kv_dt = ArrowDataType::Struct(vec![ - Field::new("key", ArrowDataType::Utf8, false), - Field::new("value", ArrowDataType::Boolean, true), + Field::new("key".into(), ArrowDataType::Utf8, false), + Field::new("value".into(), ArrowDataType::Boolean, true), ]); let kv_array1 = StructArray::try_new( kv_dt.clone(), @@ -28,7 +28,7 @@ fn equal() { ) .unwrap(); - let dt = ArrowDataType::Map(Box::new(Field::new("entries", kv_dt, true)), false); + let dt = ArrowDataType::Map(Box::new(Field::new("entries".into(), kv_dt, true)), false); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array1))); let b = MapScalar::new(dt.clone(), None); assert_eq!(a, a); @@ -42,8 +42,8 @@ fn equal() { #[test] fn basics() { let kv_dt = ArrowDataType::Struct(vec![ - Field::new("key", ArrowDataType::Utf8, false), - Field::new("value", ArrowDataType::Boolean, true), + Field::new("key".into(), ArrowDataType::Utf8, false), + Field::new("value".into(), ArrowDataType::Boolean, true), ]); let kv_array = StructArray::try_new( kv_dt.clone(), @@ -55,7 +55,7 @@ fn basics() { ) .unwrap(); - let dt = ArrowDataType::Map(Box::new(Field::new("entries", kv_dt, true)), false); + let dt = ArrowDataType::Map(Box::new(Field::new("entries".into(), kv_dt, true)), false); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array.clone()))); assert_eq!(kv_array, a.values().as_ref()); diff --git a/crates/polars/tests/it/arrow/scalar/struct_.rs b/crates/polars/tests/it/arrow/scalar/struct_.rs index 23461bb26568..0bd85620ba07 100644 --- a/crates/polars/tests/it/arrow/scalar/struct_.rs +++ b/crates/polars/tests/it/arrow/scalar/struct_.rs @@ -4,7 +4,7 @@ use arrow::scalar::{BooleanScalar, 
Scalar, StructScalar}; #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = ArrowDataType::Struct(vec![Field::new("a", ArrowDataType::Boolean, true)]); + let dt = ArrowDataType::Struct(vec![Field::new("a".into(), ArrowDataType::Boolean, true)]); let a = StructScalar::new( dt.clone(), Some(vec![ @@ -27,7 +27,7 @@ fn equal() { #[test] fn basics() { - let dt = ArrowDataType::Struct(vec![Field::new("a", ArrowDataType::Boolean, true)]); + let dt = ArrowDataType::Struct(vec![Field::new("a".into(), ArrowDataType::Boolean, true)]); let values = vec![Box::new(BooleanScalar::from(Some(true))) as Box]; diff --git a/crates/polars/tests/it/chunks/parquet.rs b/crates/polars/tests/it/chunks/parquet.rs index 26c37566845a..384382fdd5f9 100644 --- a/crates/polars/tests/it/chunks/parquet.rs +++ b/crates/polars/tests/it/chunks/parquet.rs @@ -11,7 +11,7 @@ fn test_cast_join_14872() { let mut df2 = df![ "ints" => [0, 1], - "strings" => vec![Series::new("", ["a"]); 2], + "strings" => vec![Series::new("".into(), ["a"]); 2], ] .unwrap(); @@ -30,7 +30,7 @@ fn test_cast_join_14872() { let expected = df![ "ints" => [1], - "strings" => vec![Series::new("", ["a"]); 1], + "strings" => vec![Series::new("".into(), ["a"]); 1], ] .unwrap(); diff --git a/crates/polars/tests/it/core/date_like.rs b/crates/polars/tests/it/core/date_like.rs index df91aa512afc..7777d3fd1eb0 100644 --- a/crates/polars/tests/it/core/date_like.rs +++ b/crates/polars/tests/it/core/date_like.rs @@ -4,9 +4,9 @@ use super::*; #[cfg(feature = "dtype-datetime")] #[cfg_attr(miri, ignore)] fn test_datelike_join() -> PolarsResult<()> { - let s = Series::new("foo", &[1, 2, 3]); + let s = Series::new("foo".into(), &[1, 2, 3]); let mut s1 = s.cast(&DataType::Datetime(TimeUnit::Nanoseconds, None))?; - s1.rename("bar"); + s1.rename("bar".into()); let df = DataFrame::new(vec![s, s1])?; @@ -33,7 +33,7 @@ fn test_datelike_join() -> PolarsResult<()> { #[test] #[cfg(all(feature = "dtype-datetime", feature = "dtype-duration"))] fn 
test_datelike_methods() -> PolarsResult<()> { - let s = Series::new("foo", &[1, 2, 3]); + let s = Series::new("foo".into(), &[1, 2, 3]); let s = s.cast(&DataType::Datetime(TimeUnit::Nanoseconds, None))?; let out = s.subtract(&s)?; @@ -52,7 +52,7 @@ fn test_datelike_methods() -> PolarsResult<()> { #[test] #[cfg(all(feature = "dtype-datetime", feature = "dtype-duration"))] fn test_arithmetic_dispatch() { - let s = Int64Chunked::new("", &[1, 2, 3]) + let s = Int64Chunked::new("".into(), &[1, 2, 3]) .into_datetime(TimeUnit::Nanoseconds, None) .into_series(); @@ -113,13 +113,13 @@ fn test_arithmetic_dispatch() { #[test] #[cfg(feature = "dtype-duration")] fn test_duration() -> PolarsResult<()> { - let a = Int64Chunked::new("", &[1, 2, 3]) + let a = Int64Chunked::new("".into(), &[1, 2, 3]) .into_datetime(TimeUnit::Nanoseconds, None) .into_series(); - let b = Int64Chunked::new("", &[2, 3, 4]) + let b = Int64Chunked::new("".into(), &[2, 3, 4]) .into_datetime(TimeUnit::Nanoseconds, None) .into_series(); - let c = Int64Chunked::new("", &[1, 1, 1]) + let c = Int64Chunked::new("".into(), &[1, 1, 1]) .into_duration(TimeUnit::Nanoseconds) .into_series(); assert_eq!( @@ -132,7 +132,7 @@ fn test_duration() -> PolarsResult<()> { ); assert_eq!( b.subtract(&a)?, - Int64Chunked::full("", 1, a.len()) + Int64Chunked::full("".into(), 1, a.len()) .into_duration(TimeUnit::Nanoseconds) .into_series() ); @@ -142,8 +142,12 @@ fn test_duration() -> PolarsResult<()> { #[test] #[cfg(feature = "dtype-duration")] fn test_duration_date_arithmetic() -> PolarsResult<()> { - let date1 = Int32Chunked::new("", &[1, 1, 1]).into_date().into_series(); - let date2 = Int32Chunked::new("", &[2, 3, 4]).into_date().into_series(); + let date1 = Int32Chunked::new("".into(), &[1, 1, 1]) + .into_date() + .into_series(); + let date2 = Int32Chunked::new("".into(), &[2, 3, 4]) + .into_date() + .into_series(); let diff_ms = &date2 - &date1; let diff_ms = diff_ms?; diff --git a/crates/polars/tests/it/core/group_by.rs 
b/crates/polars/tests/it/core/group_by.rs index f14caad753dd..12241bc5b2eb 100644 --- a/crates/polars/tests/it/core/group_by.rs +++ b/crates/polars/tests/it/core/group_by.rs @@ -5,7 +5,10 @@ use super::*; #[test] fn test_sorted_group_by() -> PolarsResult<()> { // nulls last - let mut s = Series::new("a", &[Some(1), Some(1), Some(1), Some(6), Some(6), None]); + let mut s = Series::new( + "a".into(), + &[Some(1), Some(1), Some(1), Some(6), Some(6), None], + ); s.set_sorted_flag(IsSorted::Ascending); for mt in [true, false] { let out = s.group_tuples(mt, false)?; @@ -14,7 +17,7 @@ fn test_sorted_group_by() -> PolarsResult<()> { // nulls first let mut s = Series::new( - "a", + "a".into(), &[None, None, Some(1), Some(1), Some(1), Some(6), Some(6)], ); s.set_sorted_flag(IsSorted::Ascending); @@ -24,7 +27,10 @@ fn test_sorted_group_by() -> PolarsResult<()> { } // nulls last - let mut s = Series::new("a", &[Some(1), Some(1), Some(1), Some(6), Some(6), None]); + let mut s = Series::new( + "a".into(), + &[Some(1), Some(1), Some(1), Some(6), Some(6), None], + ); s.set_sorted_flag(IsSorted::Ascending); for mt in [true, false] { let out = s.group_tuples(mt, false)?; @@ -33,7 +39,7 @@ fn test_sorted_group_by() -> PolarsResult<()> { // nulls first descending sorted let mut s = Series::new( - "a", + "a".into(), &[ None, None, @@ -53,7 +59,7 @@ fn test_sorted_group_by() -> PolarsResult<()> { // nulls last descending sorted let mut s = Series::new( - "a", + "a".into(), &[ Some(15), Some(15), diff --git a/crates/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs index aa5bfc415697..6f8f1758131e 100644 --- a/crates/polars/tests/it/core/joins.rs +++ b/crates/polars/tests/it/core/joins.rs @@ -39,13 +39,13 @@ fn test_chunked_left_join() -> PolarsResult<()> { } fn create_frames() -> (DataFrame, DataFrame) { - let s0 = Series::new("days", &[0, 1, 2]); - let s1 = Series::new("temp", &[22.1, 19.9, 7.]); - let s2 = Series::new("rain", &[0.2, 0.1, 0.3]); + let s0 = 
Series::new("days".into(), &[0, 1, 2]); + let s1 = Series::new("temp".into(), &[22.1, 19.9, 7.]); + let s2 = Series::new("rain".into(), &[0.2, 0.1, 0.3]); let temp = DataFrame::new(vec![s0, s1, s2]).unwrap(); - let s0 = Series::new("days", &[1, 2, 3, 1]); - let s1 = Series::new("rain", &[0.1, 0.2, 0.3, 0.4]); + let s0 = Series::new("days".into(), &[1, 2, 3, 1]); + let s1 = Series::new("rain".into(), &[0.1, 0.2, 0.3, 0.4]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); (temp, rain) } @@ -59,10 +59,10 @@ fn test_inner_join() { std::env::set_var("POLARS_MAX_THREADS", format!("{}", i)); let joined = temp.inner_join(&rain, ["days"], ["days"]).unwrap(); - let join_col_days = Series::new("days", &[1, 2, 1]); - let join_col_temp = Series::new("temp", &[19.9, 7., 19.9]); - let join_col_rain = Series::new("rain", &[0.1, 0.3, 0.1]); - let join_col_rain_right = Series::new("rain_right", [0.1, 0.2, 0.4].as_ref()); + let join_col_days = Series::new("days".into(), &[1, 2, 1]); + let join_col_temp = Series::new("temp".into(), &[19.9, 7., 19.9]); + let join_col_rain = Series::new("rain".into(), &[0.1, 0.3, 0.1]); + let join_col_rain_right = Series::new("rain_right".into(), [0.1, 0.2, 0.4].as_ref()); let true_df = DataFrame::new(vec![ join_col_days, join_col_temp, @@ -81,12 +81,12 @@ fn test_inner_join() { fn test_left_join() { for i in 1..8 { std::env::set_var("POLARS_MAX_THREADS", format!("{}", i)); - let s0 = Series::new("days", &[0, 1, 2, 3, 4]); - let s1 = Series::new("temp", &[22.1, 19.9, 7., 2., 3.]); + let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); + let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); let temp = DataFrame::new(vec![s0, s1]).unwrap(); - let s0 = Series::new("days", &[1, 2]); - let s1 = Series::new("rain", &[0.1, 0.2]); + let s0 = Series::new("days".into(), &[1, 2]); + let s1 = Series::new("rain".into(), &[0.1, 0.2]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); let joined = temp.left_join(&rain, ["days"], ["days"]).unwrap(); 
assert_eq!( @@ -96,12 +96,12 @@ fn test_left_join() { assert_eq!(joined.column("rain").unwrap().null_count(), 3); // test join on string - let s0 = Series::new("days", &["mo", "tue", "wed", "thu", "fri"]); - let s1 = Series::new("temp", &[22.1, 19.9, 7., 2., 3.]); + let s0 = Series::new("days".into(), &["mo", "tue", "wed", "thu", "fri"]); + let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); let temp = DataFrame::new(vec![s0, s1]).unwrap(); - let s0 = Series::new("days", &["tue", "wed"]); - let s1 = Series::new("rain", &[0.1, 0.2]); + let s0 = Series::new("days".into(), &["tue", "wed"]); + let s1 = Series::new("rain".into(), &[0.1, 0.2]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); let joined = temp.left_join(&rain, ["days"], ["days"]).unwrap(); assert_eq!( @@ -152,12 +152,16 @@ fn test_full_outer_join() -> PolarsResult<()> { fn test_join_with_nulls() { let dts = &[20, 21, 22, 23, 24, 25, 27, 28]; let vals = &[1.2, 2.4, 4.67, 5.8, 4.4, 3.6, 7.6, 6.5]; - let df = DataFrame::new(vec![Series::new("date", dts), Series::new("val", vals)]).unwrap(); + let df = DataFrame::new(vec![ + Series::new("date".into(), dts), + Series::new("val".into(), vals), + ]) + .unwrap(); let vals2 = &[Some(1.1), None, Some(3.3), None, None]; let df2 = DataFrame::new(vec![ - Series::new("date", &dts[3..]), - Series::new("val2", vals2), + Series::new("date".into(), &dts[3..]), + Series::new("val2".into(), vals2), ]) .unwrap(); @@ -204,7 +208,7 @@ fn test_join_multiple_columns() { .str() .unwrap() + df_a.column("b").unwrap().str().unwrap(); - s.rename("dummy"); + s.rename("dummy".into()); df_a.with_column(s).unwrap(); let mut s = df_b @@ -215,7 +219,7 @@ fn test_join_multiple_columns() { .str() .unwrap() + df_b.column("bar").unwrap().str().unwrap(); - s.rename("dummy"); + s.rename("dummy".into()); df_b.with_column(s).unwrap(); let joined = df_a.left_join(&df_b, ["dummy"], ["dummy"]).unwrap(); @@ -334,14 +338,14 @@ fn test_join_categorical() { fn test_empty_df_join() -> 
PolarsResult<()> { let empty: Vec = vec![]; let empty_df = DataFrame::new(vec![ - Series::new("key", &empty), - Series::new("eval", &empty), + Series::new("key".into(), &empty), + Series::new("eval".into(), &empty), ]) .unwrap(); let df = DataFrame::new(vec![ - Series::new("key", &["foo"]), - Series::new("aval", &[4]), + Series::new("key".into(), &["foo"]), + Series::new("aval".into(), &[4]), ]) .unwrap(); @@ -357,8 +361,8 @@ fn test_empty_df_join() -> PolarsResult<()> { let empty: Vec = vec![]; let _empty_df = DataFrame::new(vec![ - Series::new("key", &empty), - Series::new("eval", &empty), + Series::new("key".into(), &empty), + Series::new("eval".into(), &empty), ]) .unwrap(); @@ -370,9 +374,9 @@ fn test_empty_df_join() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1824 let empty: Vec = vec![]; let empty_df = DataFrame::new(vec![ - Series::new("key", &empty), - Series::new("1val", &empty), - Series::new("2val", &empty), + Series::new("key".into(), &empty), + Series::new("1val".into(), &empty), + Series::new("2val".into(), &empty), ])?; let out = df.left_join(&empty_df, ["key"], ["key"])?; @@ -604,8 +608,8 @@ fn test_4_threads_bit_offset() -> PolarsResult<()> { let mut left_b = (0..n) .map(|i| if i % 2 == 0 { None } else { Some(0) }) .collect::(); - left_a.rename("a"); - left_b.rename("b"); + left_a.rename("a".into()); + left_b.rename("b".into()); let left_df = DataFrame::new(vec![left_a.into_series(), left_b.into_series()])?; let i = 1; @@ -615,8 +619,8 @@ fn test_4_threads_bit_offset() -> PolarsResult<()> { let mut right_b = range .map(|i| if i % 3 == 0 { None } else { Some(1) }) .collect::(); - right_a.rename("a"); - right_b.rename("b"); + right_a.rename("a".into()); + right_b.rename("b".into()); let right_df = DataFrame::new(vec![right_a.into_series(), right_b.into_series()])?; let out = JoinBuilder::new(left_df.lazy()) diff --git a/crates/polars/tests/it/core/list.rs b/crates/polars/tests/it/core/list.rs index d709a40f2be4..f485ccadd482 
100644 --- a/crates/polars/tests/it/core/list.rs +++ b/crates/polars/tests/it/core/list.rs @@ -2,7 +2,7 @@ use polars::prelude::*; #[test] fn test_to_list_logical() -> PolarsResult<()> { - let ca = StringChunked::new("a", &["2021-01-01", "2021-01-02", "2021-01-03"]); + let ca = StringChunked::new("a".into(), &["2021-01-01", "2021-01-02", "2021-01-03"]); let out = ca.as_date(None, false)?.into_series(); let out = out.implode().unwrap(); assert_eq!(out.len(), 1); diff --git a/crates/polars/tests/it/core/ops/take.rs b/crates/polars/tests/it/core/ops/take.rs index 26c1bb651865..373c644da066 100644 --- a/crates/polars/tests/it/core/ops/take.rs +++ b/crates/polars/tests/it/core/ops/take.rs @@ -3,12 +3,12 @@ use super::*; #[test] fn test_list_gather_nulls_and_empty() { let a: &[i32] = &[]; - let a = Series::new("", a); - let b = Series::new("", &[None, Some(a.clone())]); + let a = Series::new("".into(), a); + let b = Series::new("".into(), &[None, Some(a.clone())]); let indices = [Some(0 as IdxSize), Some(1), None] .into_iter() - .collect_ca(""); + .collect_ca("".into()); let out = b.take(&indices).unwrap(); - let expected = Series::new("", &[None, Some(a), None]); + let expected = Series::new("".into(), &[None, Some(a), None]); assert!(out.equals_missing(&expected)) } diff --git a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs index b0e1b13ca9f4..85cf69ec1494 100644 --- a/crates/polars/tests/it/core/pivot.rs +++ b/crates/polars/tests/it/core/pivot.rs @@ -56,9 +56,9 @@ fn test_pivot_date_() -> PolarsResult<()> { #[test] fn test_pivot_old() { - let s0 = Series::new("index", ["A", "A", "B", "B", "C"].as_ref()); - let s2 = Series::new("columns", ["k", "l", "m", "m", "l"].as_ref()); - let s1 = Series::new("values", [1, 2, 2, 4, 2].as_ref()); + let s0 = Series::new("index".into(), ["A", "A", "B", "B", "C"].as_ref()); + let s2 = Series::new("columns".into(), ["k", "l", "m", "m", "l"].as_ref()); + let s1 = Series::new("values".into(), [1, 2, 2, 4, 
2].as_ref()); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let pvt = pivot( diff --git a/crates/polars/tests/it/core/rolling_window.rs b/crates/polars/tests/it/core/rolling_window.rs index b823bf7d8736..a58280e09345 100644 --- a/crates/polars/tests/it/core/rolling_window.rs +++ b/crates/polars/tests/it/core/rolling_window.rs @@ -2,7 +2,7 @@ use super::*; #[test] fn test_rolling() { - let s = Int32Chunked::new("foo", &[1, 2, 3, 2, 1]).into_series(); + let s = Int32Chunked::new("foo".into(), &[1, 2, 3, 2, 1]).into_series(); let a = s .rolling_sum(RollingOptionsFixedWindow { window_size: 2, @@ -57,7 +57,7 @@ fn test_rolling() { #[test] fn test_rolling_min_periods() { - let s = Int32Chunked::new("foo", &[1, 2, 3, 2, 1]).into_series(); + let s = Int32Chunked::new("foo".into(), &[1, 2, 3, 2, 1]).into_series(); let a = s .rolling_max(RollingOptionsFixedWindow { window_size: 2, @@ -72,7 +72,7 @@ fn test_rolling_min_periods() { #[test] fn test_rolling_mean() { let s = Float64Chunked::new( - "foo", + "foo".into(), &[ Some(0.0), Some(1.0), @@ -141,7 +141,7 @@ fn test_rolling_mean() { ); // integers - let ca = Int32Chunked::from_slice("", &[1, 8, 6, 2, 16, 10]); + let ca = Int32Chunked::from_slice("".into(), &[1, 8, 6, 2, 16, 10]); let out = ca .into_series() .rolling_mean(RollingOptionsFixedWindow { @@ -163,7 +163,7 @@ fn test_rolling_mean() { #[test] fn test_rolling_map() { let ca = Float64Chunked::new( - "foo", + "foo".into(), &[ Some(0.0), Some(1.0), @@ -177,7 +177,7 @@ fn test_rolling_map() { let out = ca .rolling_map( - &|s| s.sum_reduce().unwrap().into_series(s.name()), + &|s| s.sum_reduce().unwrap().into_series(s.name().clone()), RollingOptionsFixedWindow { window_size: 3, min_periods: 3, @@ -197,7 +197,7 @@ fn test_rolling_map() { #[test] fn test_rolling_var() { let s = Float64Chunked::new( - "foo", + "foo".into(), &[ Some(0.0), Some(1.0), @@ -237,7 +237,7 @@ fn test_rolling_var() { &[None, None, Some(1), None, None, None, None,] ); - let s = 
Float64Chunked::from_slice("", &[0.0, 2.0, 8.0, 3.0, 12.0, 1.0]).into_series(); + let s = Float64Chunked::from_slice("".into(), &[0.0, 2.0, 8.0, 3.0, 12.0, 1.0]).into_series(); let out = s .rolling_var(options) .unwrap() diff --git a/crates/polars/tests/it/core/series.rs b/crates/polars/tests/it/core/series.rs index 017609898b51..3d740ad5d940 100644 --- a/crates/polars/tests/it/core/series.rs +++ b/crates/polars/tests/it/core/series.rs @@ -3,19 +3,19 @@ use polars::series::*; #[test] fn test_series_arithmetic() -> PolarsResult<()> { - let a = &Series::new("a", &[1, 100, 6, 40]); - let b = &Series::new("b", &[-1, 2, 3, 4]); - assert_eq!((a + b)?, Series::new("a", &[0, 102, 9, 44])); - assert_eq!((a - b)?, Series::new("a", &[2, 98, 3, 36])); - assert_eq!((a * b)?, Series::new("a", &[-1, 200, 18, 160])); - assert_eq!((a / b)?, Series::new("a", &[-1, 50, 2, 10])); + let a = &Series::new("a".into(), &[1, 100, 6, 40]); + let b = &Series::new("b".into(), &[-1, 2, 3, 4]); + assert_eq!((a + b)?, Series::new("a".into(), &[0, 102, 9, 44])); + assert_eq!((a - b)?, Series::new("a".into(), &[2, 98, 3, 36])); + assert_eq!((a * b)?, Series::new("a".into(), &[-1, 200, 18, 160])); + assert_eq!((a / b)?, Series::new("a".into(), &[-1, 50, 2, 10])); Ok(()) } #[test] fn test_min_max_sorted_asc() { - let a = &mut Series::new("a", &[1, 2, 3, 4]); + let a = &mut Series::new("a".into(), &[1, 2, 3, 4]); a.set_sorted_flag(IsSorted::Ascending); assert_eq!(a.max().unwrap(), Some(4)); assert_eq!(a.min().unwrap(), Some(1)); @@ -23,7 +23,7 @@ fn test_min_max_sorted_asc() { #[test] fn test_min_max_sorted_desc() { - let a = &mut Series::new("a", &[4, 3, 2, 1]); + let a = &mut Series::new("a".into(), &[4, 3, 2, 1]); a.set_sorted_flag(IsSorted::Descending); assert_eq!(a.max().unwrap(), Some(4)); assert_eq!(a.min().unwrap(), Some(1)); @@ -31,7 +31,13 @@ fn test_min_max_sorted_desc() { #[test] fn test_construct_list_of_null_series() { - let s = Series::new("a", [Series::new_null("a1", 1), 
Series::new_null("a1", 1)]); + let s = Series::new( + "a".into(), + [ + Series::new_null("a1".into(), 1), + Series::new_null("a1".into(), 1), + ], + ); assert_eq!(s.null_count(), 0); assert_eq!(s.field().name(), "a"); } diff --git a/crates/polars/tests/it/io/avro/read.rs b/crates/polars/tests/it/io/avro/read.rs index 2482fb6103c7..e3f799f02674 100644 --- a/crates/polars/tests/it/io/avro/read.rs +++ b/crates/polars/tests/it/io/avro/read.rs @@ -55,27 +55,31 @@ pub(super) fn schema() -> (AvroSchema, ArrowSchema) { "#; let schema = ArrowSchema::from(vec![ - Field::new("a", ArrowDataType::Int64, false), - Field::new("b", ArrowDataType::Utf8, false), - Field::new("c", ArrowDataType::Int32, false), - Field::new("date", ArrowDataType::Date32, false), - Field::new("d", ArrowDataType::Binary, false), - Field::new("e", ArrowDataType::Float64, false), - Field::new("f", ArrowDataType::Boolean, false), - Field::new("g", ArrowDataType::Utf8, true), + Field::new("a".into(), ArrowDataType::Int64, false), + Field::new("b".into(), ArrowDataType::Utf8, false), + Field::new("c".into(), ArrowDataType::Int32, false), + Field::new("date".into(), ArrowDataType::Date32, false), + Field::new("d".into(), ArrowDataType::Binary, false), + Field::new("e".into(), ArrowDataType::Float64, false), + Field::new("f".into(), ArrowDataType::Boolean, false), + Field::new("g".into(), ArrowDataType::Utf8, true), Field::new( - "h", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))), + "h".into(), + ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + true, + ))), false, ), Field::new( - "i", - ArrowDataType::Struct(vec![Field::new("e", ArrowDataType::Float64, false)]), + "i".into(), + ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), false, ), Field::new( - "nullable_struct", - ArrowDataType::Struct(vec![Field::new("e", ArrowDataType::Float64, false)]), + "nullable_struct".into(), + 
ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), true, ), ]); @@ -105,13 +109,13 @@ pub(super) fn data() -> RecordBatchT> { Utf8Array::::from([Some("foo"), None]).boxed(), array.into_box(), StructArray::new( - ArrowDataType::Struct(vec![Field::new("e", ArrowDataType::Float64, false)]), + ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), vec![PrimitiveArray::::from_slice([1.0, 2.0]).boxed()], None, ) .boxed(), StructArray::new( - ArrowDataType::Struct(vec![Field::new("e", ArrowDataType::Float64, false)]), + ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), vec![PrimitiveArray::::from_slice([1.0, 0.0]).boxed()], Some([true, false].into()), ) @@ -298,8 +302,12 @@ fn schema_list() -> (AvroSchema, ArrowSchema) { "#; let schema = ArrowSchema::from(vec![Field::new( - "h", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, false))), + "h".into(), + ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + false, + ))), false, )]); @@ -311,7 +319,11 @@ pub(super) fn data_list() -> RecordBatchT> { let mut array = MutableListArray::>::new_from( Default::default(), - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, false))), + ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + false, + ))), 0, ); array.try_extend(data).unwrap(); diff --git a/crates/polars/tests/it/io/avro/write.rs b/crates/polars/tests/it/io/avro/write.rs index dade870e96c6..061eb52f88e9 100644 --- a/crates/polars/tests/it/io/avro/write.rs +++ b/crates/polars/tests/it/io/avro/write.rs @@ -16,38 +16,54 @@ use super::read::read_avro; pub(super) fn schema() -> ArrowSchema { ArrowSchema::from(vec![ - Field::new("int64", ArrowDataType::Int64, false), - Field::new("int64 nullable", ArrowDataType::Int64, true), - Field::new("utf8", ArrowDataType::Utf8, false), - Field::new("utf8 nullable", ArrowDataType::Utf8, true), 
- Field::new("int32", ArrowDataType::Int32, false), - Field::new("int32 nullable", ArrowDataType::Int32, true), - Field::new("date", ArrowDataType::Date32, false), - Field::new("date nullable", ArrowDataType::Date32, true), - Field::new("binary", ArrowDataType::Binary, false), - Field::new("binary nullable", ArrowDataType::Binary, true), - Field::new("float32", ArrowDataType::Float32, false), - Field::new("float32 nullable", ArrowDataType::Float32, true), - Field::new("float64", ArrowDataType::Float64, false), - Field::new("float64 nullable", ArrowDataType::Float64, true), - Field::new("boolean", ArrowDataType::Boolean, false), - Field::new("boolean nullable", ArrowDataType::Boolean, true), + Field::new("int64".into(), ArrowDataType::Int64, false), + Field::new("int64 nullable".into(), ArrowDataType::Int64, true), + Field::new("utf8".into(), ArrowDataType::Utf8, false), + Field::new("utf8 nullable".into(), ArrowDataType::Utf8, true), + Field::new("int32".into(), ArrowDataType::Int32, false), + Field::new("int32 nullable".into(), ArrowDataType::Int32, true), + Field::new("date".into(), ArrowDataType::Date32, false), + Field::new("date nullable".into(), ArrowDataType::Date32, true), + Field::new("binary".into(), ArrowDataType::Binary, false), + Field::new("binary nullable".into(), ArrowDataType::Binary, true), + Field::new("float32".into(), ArrowDataType::Float32, false), + Field::new("float32 nullable".into(), ArrowDataType::Float32, true), + Field::new("float64".into(), ArrowDataType::Float64, false), + Field::new("float64 nullable".into(), ArrowDataType::Float64, true), + Field::new("boolean".into(), ArrowDataType::Boolean, false), + Field::new("boolean nullable".into(), ArrowDataType::Boolean, true), Field::new( - "list", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))), + "list".into(), + ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + true, + ))), false, ), Field::new( - "list nullable", - 
ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))), + "list nullable".into(), + ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + true, + ))), true, ), ]) } pub(super) fn data() -> RecordBatchT> { - let list_dt = ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))); - let list_dt1 = ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))); + let list_dt = ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + true, + ))); + let list_dt1 = ArrowDataType::List(Box::new(Field::new( + "item".into(), + ArrowDataType::Int32, + true, + ))); let columns = vec![ Box::new(Int64Array::from_slice([27, 47])) as Box, @@ -163,10 +179,14 @@ fn deflate() -> PolarsResult<()> { fn large_format_schema() -> ArrowSchema { ArrowSchema::from(vec![ - Field::new("large_utf8", ArrowDataType::LargeUtf8, false), - Field::new("large_utf8_nullable", ArrowDataType::LargeUtf8, true), - Field::new("large_binary", ArrowDataType::LargeBinary, false), - Field::new("large_binary_nullable", ArrowDataType::LargeBinary, true), + Field::new("large_utf8".into(), ArrowDataType::LargeUtf8, false), + Field::new("large_utf8_nullable".into(), ArrowDataType::LargeUtf8, true), + Field::new("large_binary".into(), ArrowDataType::LargeBinary, false), + Field::new( + "large_binary_nullable".into(), + ArrowDataType::LargeBinary, + true, + ), ]) } @@ -182,10 +202,10 @@ fn large_format_data() -> RecordBatchT> { fn large_format_expected_schema() -> ArrowSchema { ArrowSchema::from(vec![ - Field::new("large_utf8", ArrowDataType::Utf8, false), - Field::new("large_utf8_nullable", ArrowDataType::Utf8, true), - Field::new("large_binary", ArrowDataType::Binary, false), - Field::new("large_binary_nullable", ArrowDataType::Binary, true), + Field::new("large_utf8".into(), ArrowDataType::Utf8, false), + Field::new("large_utf8_nullable".into(), ArrowDataType::Utf8, true), + 
Field::new("large_binary".into(), ArrowDataType::Binary, false), + Field::new("large_binary_nullable".into(), ArrowDataType::Binary, true), ]) } @@ -221,18 +241,18 @@ fn check_large_format() -> PolarsResult<()> { fn struct_schema() -> ArrowSchema { ArrowSchema::from(vec![ Field::new( - "struct", + "struct".into(), ArrowDataType::Struct(vec![ - Field::new("item1", ArrowDataType::Int32, false), - Field::new("item2", ArrowDataType::Int32, true), + Field::new("item1".into(), ArrowDataType::Int32, false), + Field::new("item2".into(), ArrowDataType::Int32, true), ]), false, ), Field::new( - "struct nullable", + "struct nullable".into(), ArrowDataType::Struct(vec![ - Field::new("item1", ArrowDataType::Int32, false), - Field::new("item2", ArrowDataType::Int32, true), + Field::new("item1".into(), ArrowDataType::Int32, false), + Field::new("item2".into(), ArrowDataType::Int32, true), ]), true, ), @@ -241,8 +261,8 @@ fn struct_schema() -> ArrowSchema { fn struct_data() -> RecordBatchT> { let struct_dt = ArrowDataType::Struct(vec![ - Field::new("item1", ArrowDataType::Int32, false), - Field::new("item2", ArrowDataType::Int32, true), + Field::new("item1".into(), ArrowDataType::Int32, false), + Field::new("item2".into(), ArrowDataType::Int32, true), ]); RecordBatchT::new(vec![ diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 1b78969093e2..8b5f10bf8aaf 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -44,10 +44,16 @@ fn write_csv() { fn write_dates() { use polars_core::export::chrono; - let s0 = Series::new("date", [chrono::NaiveDate::from_yo_opt(2024, 33), None]); - let s1 = Series::new("time", [None, chrono::NaiveTime::from_hms_opt(19, 50, 0)]); + let s0 = Series::new( + "date".into(), + [chrono::NaiveDate::from_yo_opt(2024, 33), None], + ); + let s1 = Series::new( + "time".into(), + [None, chrono::NaiveTime::from_hms_opt(19, 50, 0)], + ); let s2 = Series::new( - "datetime", + "datetime".into(), [ 
Some(chrono::NaiveDateTime::new( chrono::NaiveDate::from_ymd_opt(2000, 12, 1).unwrap(), @@ -112,7 +118,7 @@ fn write_dates() { let with_timezone = polars_ops::chunked_array::replace_time_zone( s2.slice(0, 1).datetime().unwrap(), Some("America/New_York"), - &StringChunked::new("", ["raise"]), + &StringChunked::new("".into(), ["raise"]), NonExistent::Raise, ) .unwrap() @@ -214,7 +220,7 @@ fn test_parser() -> PolarsResult<()> { assert_eq!(col.get(0)?, AnyValue::String("Setosa")); assert_eq!(col.get(2)?, AnyValue::String("Setosa")); - assert_eq!("sepal_length", df.get_columns()[0].name()); + assert_eq!("sepal_length", df.get_columns()[0].name().as_str()); assert_eq!(1, df.column("sepal_length").unwrap().chunks().len()); assert_eq!(df.height(), 7); @@ -229,7 +235,7 @@ fn test_parser() -> PolarsResult<()> { .finish() .unwrap(); - assert_eq!("head_1", df.get_columns()[0].name()); + assert_eq!("head_1", df.get_columns()[0].name().as_str()); assert_eq!(df.shape(), (3, 2)); // test windows line ending with 1 byte char column and no line endings for last line. 
@@ -243,7 +249,7 @@ fn test_parser() -> PolarsResult<()> { .finish() .unwrap(); - assert_eq!("head_1", df.get_columns()[0].name()); + assert_eq!("head_1", df.get_columns()[0].name().as_str()); assert_eq!(df.shape(), (3, 1)); Ok(()) } @@ -303,15 +309,15 @@ fn test_missing_data() { assert!(df .column("column_1") .unwrap() - .equals(&Series::new("column_1", &[1_i64, 1]))); + .equals(&Series::new("column_1".into(), &[1_i64, 1]))); assert!(df .column("column_2") .unwrap() - .equals_missing(&Series::new("column_2", &[Some(2_i64), None]))); + .equals_missing(&Series::new("column_2".into(), &[Some(2_i64), None]))); assert!(df .column("column_3") .unwrap() - .equals(&Series::new("column_3", &[3_i64, 3]))); + .equals(&Series::new("column_3".into(), &[3_i64, 3]))); } #[test] @@ -326,7 +332,7 @@ fn test_escape_comma() { assert!(df .column("column_3") .unwrap() - .equals(&Series::new("column_3", &[11_i64, 12]))); + .equals(&Series::new("column_3".into(), &[11_i64, 12]))); } #[test] @@ -339,7 +345,7 @@ fn test_escape_double_quotes() { let df = CsvReader::new(file).finish().unwrap(); assert_eq!(df.shape(), (2, 3)); assert!(df.column("column_2").unwrap().equals(&Series::new( - "column_2", + "column_2".into(), &[ r#"with "double quotes" US"#, r#"with "double quotes followed", by comma"# @@ -397,7 +403,7 @@ hello,","," ",world,"!" assert!(df .column(col) .unwrap() - .equals(&Series::new(col, &[&**val; 4]))); + .equals(&Series::new(col.into(), &[&**val; 4]))); } } @@ -420,7 +426,7 @@ versions of Lorem Ipsum.",11 .unwrap(); assert!(df.column("column_2").unwrap().equals(&Series::new( - "column_2", + "column_2".into(), &[ r#"Lorem Ipsum is simply dummy text of the printing and typesetting industry. 
Lorem Ipsum has been the industry's standard dummy text ever since th @@ -508,13 +514,13 @@ fn test_quoted_numeric() { #[test] fn test_empty_bytes_to_dataframe() { - let fields = vec![Field::new("test_field", DataType::String)]; + let fields = vec![Field::new("test_field".into(), DataType::String)]; let schema = Schema::from_iter(fields); let file = Cursor::new(vec![]); let result = CsvReadOptions::default() .with_has_header(false) - .with_columns(Some(schema.iter_names().map(|s| s.to_string()).collect())) + .with_columns(Some(schema.iter_names().cloned().collect())) .with_schema(Some(Arc::new(schema))) .into_reader_with_file_handle(file) .finish(); @@ -548,9 +554,9 @@ fn test_missing_value() { let df = CsvReadOptions::default() .with_has_header(true) .with_schema(Some(Arc::new(Schema::from_iter([ - Field::new("foo", DataType::UInt32), - Field::new("bar", DataType::UInt32), - Field::new("ham", DataType::UInt32), + Field::new("foo".into(), DataType::UInt32), + Field::new("bar".into(), DataType::UInt32), + Field::new("ham".into(), DataType::UInt32), ])))) .into_reader_with_file_handle(file) .finish() @@ -571,7 +577,7 @@ AUDCAD,1616455921,0.96212,0.95666,1 let df = CsvReadOptions::default() .with_has_header(true) .with_schema_overwrite(Some(Arc::new(Schema::from_iter([Field::new( - "b", + "b".into(), DataType::Datetime(TimeUnit::Nanoseconds, None), )])))) .with_ignore_errors(true) @@ -730,8 +736,7 @@ null-value,b,bar let file = Cursor::new(csv); let df = CsvReadOptions::default() .map_parse_options(|parse_options| { - parse_options - .with_null_values(Some(NullValues::AllColumnsSingle("null-value".to_string()))) + parse_options.with_null_values(Some(NullValues::AllColumnsSingle("null-value".into()))) }) .into_reader_with_file_handle(file) .finish()?; diff --git a/crates/polars/tests/it/io/ipc.rs b/crates/polars/tests/it/io/ipc.rs index 6b5e2a83ba41..8a5602c86051 100644 --- a/crates/polars/tests/it/io/ipc.rs +++ b/crates/polars/tests/it/io/ipc.rs @@ -24,8 +24,8 @@ fn 
test_ipc_compression_variadic_buffers() { #[cfg(test)] pub(crate) fn create_df() -> DataFrame { - let s0 = Series::new("days", [0, 1, 2, 3, 4].as_ref()); - let s1 = Series::new("temp", [22.1, 19.9, 7., 2., 3.].as_ref()); + let s0 = Series::new("days".into(), [0, 1, 2, 3, 4].as_ref()); + let s1 = Series::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } @@ -140,7 +140,7 @@ fn test_write_with_compression() { #[test] fn write_and_read_ipc_empty_series() { let mut buf: Cursor> = Cursor::new(Vec::new()); - let chunked_array = Float64Chunked::new("empty", &[0_f64; 0]); + let chunked_array = Float64Chunked::new("empty".into(), &[0_f64; 0]); let mut df = DataFrame::new(vec![chunked_array.into_series()]).unwrap(); IpcWriter::new(&mut buf) .finish(&mut df) diff --git a/crates/polars/tests/it/io/ipc_stream.rs b/crates/polars/tests/it/io/ipc_stream.rs index 18d67990cb53..d12082d0dd71 100644 --- a/crates/polars/tests/it/io/ipc_stream.rs +++ b/crates/polars/tests/it/io/ipc_stream.rs @@ -145,7 +145,10 @@ mod test { #[test] fn write_and_read_ipc_stream_empty_series() { fn df() -> DataFrame { - DataFrame::new(vec![Float64Chunked::new("empty", &[0_f64; 0]).into_series()]).unwrap() + DataFrame::new(vec![ + Float64Chunked::new("empty".into(), &[0_f64; 0]).into_series() + ]) + .unwrap() } let reader = create_ipc_stream(df()); diff --git a/crates/polars/tests/it/io/json.rs b/crates/polars/tests/it/io/json.rs index faf17d71d07e..de8253a9656f 100644 --- a/crates/polars/tests/it/io/json.rs +++ b/crates/polars/tests/it/io/json.rs @@ -25,8 +25,8 @@ fn read_json() { .with_batch_size(NonZeroUsize::new(3).unwrap()) .finish() .unwrap(); - assert_eq!("a", df.get_columns()[0].name()); - assert_eq!("d", df.get_columns()[3].name()); + assert_eq!("a", df.get_columns()[0].name().as_str()); + assert_eq!("d", df.get_columns()[3].name().as_str()); assert_eq!((12, 4), df.shape()); } #[test] @@ -53,8 +53,8 @@ fn read_json_with_whitespace() { 
.with_batch_size(NonZeroUsize::new(3).unwrap()) .finish() .unwrap(); - assert_eq!("a", df.get_columns()[0].name()); - assert_eq!("d", df.get_columns()[3].name()); + assert_eq!("a", df.get_columns()[0].name().as_str()); + assert_eq!("d", df.get_columns()[3].name().as_str()); assert_eq!((12, 4), df.shape()); } #[test] @@ -76,12 +76,12 @@ fn read_json_with_escapes() { .infer_schema_len(NonZeroUsize::new(6)) .finish() .unwrap(); - assert_eq!("id", df.get_columns()[0].name()); + assert_eq!("id", df.get_columns()[0].name().as_str()); assert_eq!( AnyValue::String("\""), df.column("text").unwrap().get(0).unwrap() ); - assert_eq!("text", df.get_columns()[1].name()); + assert_eq!("text", df.get_columns()[1].name().as_str()); assert_eq!((10, 3), df.shape()); } @@ -107,8 +107,8 @@ fn read_unordered_json() { .with_batch_size(NonZeroUsize::new(3).unwrap()) .finish() .unwrap(); - assert_eq!("a", df.get_columns()[0].name()); - assert_eq!("d", df.get_columns()[3].name()); + assert_eq!("a", df.get_columns()[0].name().as_str()); + assert_eq!("d", df.get_columns()[3].name().as_str()); assert_eq!((12, 4), df.shape()); } @@ -141,9 +141,15 @@ fn test_read_ndjson_iss_5875() { let df = JsonLineReader::new(cursor).finish(); assert!(df.is_ok()); - let field_int_inner = Field::new("int_inner", DataType::List(Box::new(DataType::Int64))); - let field_float_inner = Field::new("float_inner", DataType::Float64); - let field_str_inner = Field::new("str_inner", DataType::List(Box::new(DataType::String))); + let field_int_inner = Field::new( + "int_inner".into(), + DataType::List(Box::new(DataType::Int64)), + ); + let field_float_inner = Field::new("float_inner".into(), DataType::Float64); + let field_str_inner = Field::new( + "str_inner".into(), + DataType::List(Box::new(DataType::String)), + ); let mut schema = Schema::new(); schema.with_column( diff --git a/crates/polars/tests/it/io/mod.rs b/crates/polars/tests/it/io/mod.rs index 4835171721c9..2fd9aab899d1 100644 --- 
a/crates/polars/tests/it/io/mod.rs +++ b/crates/polars/tests/it/io/mod.rs @@ -17,7 +17,7 @@ mod ipc_stream; use polars::prelude::*; pub(crate) fn create_df() -> DataFrame { - let s0 = Series::new("days", [0, 1, 2, 3, 4].as_ref()); - let s1 = Series::new("temp", [22.1, 19.9, 7., 2., 3.].as_ref()); + let s0 = Series::new("days".into(), [0, 1, 2, 3, 4].as_ref()); + let s1 = Series::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index f5e0b2e39e3d..7d90679f8804 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -25,7 +25,7 @@ fn new_struct( let fields = names .into_iter() .zip(arrays.iter()) - .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) + .map(|(n, a)| Field::new(n.into(), a.data_type().clone(), true)) .collect(); StructArray::new(ArrowDataType::Struct(fields), arrays, validity) } @@ -75,7 +75,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { // ] let a = ListArray::::new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + "item".into(), ArrowDataType::Utf8View, true, ))), @@ -84,7 +84,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { None, ); StructArray::new( - ArrowDataType::Struct(vec![Field::new("f1", a.data_type().clone(), true)]), + ArrowDataType::Struct(vec![Field::new("f1".into(), a.data_type().clone(), true)]), vec![a.boxed()], None, ) @@ -94,7 +94,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { let values = pyarrow_nested_edge("struct_list_nullable"); ListArray::::new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + "item".into(), values.data_type().clone(), true, ))), @@ -305,7 +305,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array = ListArray::::new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + "item".into(), array.data_type().clone(), true, ))), @@ -344,14 
+344,20 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { match column { "list_int64_required_required" => { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - let data_type = - ArrowDataType::LargeList(Box::new(Field::new("item", ArrowDataType::Int64, false))); + let data_type = ArrowDataType::LargeList(Box::new(Field::new( + "item".into(), + ArrowDataType::Int64, + false, + ))); ListArray::::new(data_type, offsets, values, None).boxed() }, "list_int64_optional_required" => { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - let data_type = - ArrowDataType::LargeList(Box::new(Field::new("item", ArrowDataType::Int64, true))); + let data_type = ArrowDataType::LargeList(Box::new(Field::new( + "item".into(), + ArrowDataType::Int64, + true, + ))); ListArray::::new(data_type, offsets, values, None).boxed() }, "list_nested_i64" => { @@ -413,16 +419,22 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { "struct_list_nullable" => new_struct(vec![values], vec!["a".to_string()], None).boxed(), _ => { let field = match column { - "list_int64" => Field::new("item", ArrowDataType::Int64, true), - "list_int64_required" => Field::new("item", ArrowDataType::Int64, false), - "list_int16" => Field::new("item", ArrowDataType::Int16, true), - "list_bool" => Field::new("item", ArrowDataType::Boolean, true), - "list_utf8" => Field::new("item", ArrowDataType::Utf8View, true), - "list_large_binary" => Field::new("item", ArrowDataType::LargeBinary, true), - "list_decimal" => Field::new("item", ArrowDataType::Decimal(9, 0), true), - "list_decimal256" => Field::new("item", ArrowDataType::Decimal256(9, 0), true), - "list_struct_nullable" => Field::new("item", values.data_type().clone(), true), - "list_struct_list_nullable" => Field::new("item", values.data_type().clone(), true), + "list_int64" => Field::new("item".into(), ArrowDataType::Int64, true), + "list_int64_required" => Field::new("item".into(), ArrowDataType::Int64, false), + "list_int16" => 
Field::new("item".into(), ArrowDataType::Int16, true), + "list_bool" => Field::new("item".into(), ArrowDataType::Boolean, true), + "list_utf8" => Field::new("item".into(), ArrowDataType::Utf8View, true), + "list_large_binary" => Field::new("item".into(), ArrowDataType::LargeBinary, true), + "list_decimal" => Field::new("item".into(), ArrowDataType::Decimal(9, 0), true), + "list_decimal256" => { + Field::new("item".into(), ArrowDataType::Decimal256(9, 0), true) + }, + "list_struct_nullable" => { + Field::new("item".into(), values.data_type().clone(), true) + }, + "list_struct_list_nullable" => { + Field::new("item".into(), values.data_type().clone(), true) + }, other => unreachable!("{}", other), }; @@ -520,7 +532,7 @@ pub fn pyarrow_nullable(column: &str) -> Box { .to(ArrowDataType::Timestamp(TimeUnit::Second, None)), ), "timestamp_s_utc" => Box::new(PrimitiveArray::::from(i64_values).to( - ArrowDataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + ArrowDataType::Timestamp(TimeUnit::Second, Some("UTC".into())), )), _ => unreachable!(), } @@ -609,11 +621,11 @@ pub fn pyarrow_nullable_statistics(column: &str) -> Statistics { null_count: UInt64Array::from([Some(3)]).boxed(), min_value: Box::new(Int64Array::from_slice([-256]).to(ArrowDataType::Timestamp( TimeUnit::Second, - Some("UTC".to_string()), + Some("UTC".into()), ))), max_value: Box::new(Int64Array::from_slice([9]).to(ArrowDataType::Timestamp( TimeUnit::Second, - Some("UTC".to_string()), + Some("UTC".into()), ))), }, _ => unreachable!(), @@ -666,7 +678,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { let new_list = |array: Box, nullable: bool| { ListArray::::new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + "item".into(), array.data_type().clone(), nullable, ))), @@ -913,7 +925,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { let new_list = |array: Box| { ListArray::::new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + 
"item".into(), array.data_type().clone(), true, ))), @@ -927,7 +939,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { let fields = names .into_iter() .zip(arrays.iter()) - .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) + .map(|(n, a)| Field::new(n.into(), a.data_type().clone(), true)) .collect(); StructArray::new(ArrowDataType::Struct(fields), arrays, None) }; @@ -1031,8 +1043,8 @@ pub fn pyarrow_struct(column: &str) -> Box { let mask = [true, true, false, true, true, true, true, true, true, true]; let fields = vec![ - Field::new("f1", ArrowDataType::Utf8View, true), - Field::new("f2", ArrowDataType::Boolean, true), + Field::new("f1".into(), ArrowDataType::Utf8View, true), + Field::new("f2".into(), ArrowDataType::Boolean, true), ]; match column { "struct" => { @@ -1046,8 +1058,8 @@ pub fn pyarrow_struct(column: &str) -> Box { let struct_ = pyarrow_struct("struct"); Box::new(StructArray::new( ArrowDataType::Struct(vec![ - Field::new("f1", ArrowDataType::Struct(fields), true), - Field::new("f2", ArrowDataType::Boolean, true), + Field::new("f1".into(), ArrowDataType::Struct(fields), true), + Field::new("f2".into(), ArrowDataType::Boolean, true), ]), vec![struct_, boolean], None, @@ -1057,8 +1069,8 @@ pub fn pyarrow_struct(column: &str) -> Box { let struct_ = pyarrow_struct("struct"); Box::new(StructArray::new( ArrowDataType::Struct(vec![ - Field::new("f1", ArrowDataType::Struct(fields), true), - Field::new("f2", ArrowDataType::Boolean, true), + Field::new("f1".into(), ArrowDataType::Struct(fields), true), + Field::new("f2".into(), ArrowDataType::Boolean, true), ]), vec![struct_, boolean], Some(mask.into()), @@ -1311,7 +1323,7 @@ fn generic_data() -> PolarsResult<(ArrowSchema, RecordBatchT>)> { let values = PrimitiveArray::from_slice([1i64, 3]) .to(ArrowDataType::Timestamp( TimeUnit::Millisecond, - Some("UTC".to_string()), + Some("UTC".into()), )) .boxed(); let array7 = DictionaryArray::try_from_keys(indices.clone(), values).unwrap(); 
@@ -1335,18 +1347,18 @@ fn generic_data() -> PolarsResult<(ArrowSchema, RecordBatchT>)> { .to(ArrowDataType::Interval(IntervalUnit::YearMonth)); let schema = ArrowSchema::from(vec![ - Field::new("a1", array1.data_type().clone(), true), - Field::new("a2", array2.data_type().clone(), true), - Field::new("a3", array3.data_type().clone(), true), - Field::new("a4", array4.data_type().clone(), true), - Field::new("a6", array6.data_type().clone(), true), - Field::new("a7", array7.data_type().clone(), true), - Field::new("a8", array8.data_type().clone(), true), - Field::new("a9", array9.data_type().clone(), true), - Field::new("a10", array10.data_type().clone(), true), - Field::new("a11", array11.data_type().clone(), true), - Field::new("a12", array12.data_type().clone(), true), - Field::new("a13", array13.data_type().clone(), true), + Field::new("a1".into(), array1.data_type().clone(), true), + Field::new("a2".into(), array2.data_type().clone(), true), + Field::new("a3".into(), array3.data_type().clone(), true), + Field::new("a4".into(), array4.data_type().clone(), true), + Field::new("a6".into(), array6.data_type().clone(), true), + Field::new("a7".into(), array7.data_type().clone(), true), + Field::new("a8".into(), array8.data_type().clone(), true), + Field::new("a9".into(), array9.data_type().clone(), true), + Field::new("a10".into(), array10.data_type().clone(), true), + Field::new("a11".into(), array11.data_type().clone(), true), + Field::new("a12".into(), array12.data_type().clone(), true), + Field::new("a13".into(), array13.data_type().clone(), true), ]); let chunk = RecordBatchT::try_new(vec![ array1.boxed(), @@ -1428,7 +1440,7 @@ fn data>( ]; let mut array = MutableListArray::::new_with_field( MutablePrimitiveArray::::new(), - "item", + "item".into(), inner_is_nullable, ); array.try_extend(data).unwrap(); @@ -1441,7 +1453,7 @@ fn assert_array_roundtrip( limit: Option, ) -> PolarsResult<()> { let schema = ArrowSchema::from(vec![Field::new( - "a1", + "a1".into(), 
array.data_type().clone(), is_nullable, )]); @@ -1496,7 +1508,7 @@ fn list_slice() -> PolarsResult<()> { ]; let mut array = MutableListArray::::new_with_field( MutablePrimitiveArray::::new(), - "item", + "item".into(), true, ); array.try_extend(data).unwrap(); @@ -1533,7 +1545,7 @@ fn list_int_nullable() -> PolarsResult<()> { ]; let mut array = MutableListArray::::new_with_field( MutablePrimitiveArray::::new(), - "item", + "item".into(), true, ); array.try_extend(data).unwrap(); @@ -1564,7 +1576,7 @@ fn nested_dict_data( let values = DictionaryArray::try_from_keys(indices, values).unwrap(); let values = LargeListArray::try_new( ArrowDataType::LargeList(Box::new(Field::new( - "item", + "item".into(), values.data_type().clone(), false, ))), @@ -1573,7 +1585,11 @@ fn nested_dict_data( Some([true, false, true, true].into()), )?; - let schema = ArrowSchema::from(vec![Field::new("c1", values.data_type().clone(), true)]); + let schema = ArrowSchema::from(vec![Field::new( + "c1".into(), + values.data_type().clone(), + true, + )]); let chunk = RecordBatchT::try_new(vec![values.boxed()])?; Ok((schema, chunk)) @@ -1604,7 +1620,7 @@ fn nested_dict_limit() -> PolarsResult<()> { fn filter_chunk() -> PolarsResult<()> { let chunk1 = RecordBatchT::new(vec![PrimitiveArray::from_slice([1i16, 3]).boxed()]); let chunk2 = RecordBatchT::new(vec![PrimitiveArray::from_slice([2i16, 4]).boxed()]); - let schema = ArrowSchema::from(vec![Field::new("c1", ArrowDataType::Int16, true)]); + let schema = ArrowSchema::from(vec![Field::new("c1".into(), ArrowDataType::Int16, true)]); let r = integration_write(&schema, &[chunk1.clone(), chunk2.clone()])?; diff --git a/crates/polars/tests/it/io/parquet/arrow/read.rs b/crates/polars/tests/it/io/parquet/arrow/read.rs index ffff1c99667b..974f67b0e879 100644 --- a/crates/polars/tests/it/io/parquet/arrow/read.rs +++ b/crates/polars/tests/it/io/parquet/arrow/read.rs @@ -127,7 +127,7 @@ fn read_int96_timestamps() -> PolarsResult<()> { let metadata = 
read_metadata(&mut reader)?; let schema = arrow::datatypes::ArrowSchema { fields: vec![arrow::datatypes::Field::new( - "timestamps", + "timestamps".into(), arrow::datatypes::ArrowDataType::Timestamp(time_unit, None), false, )], diff --git a/crates/polars/tests/it/io/parquet/arrow/write.rs b/crates/polars/tests/it/io/parquet/arrow/write.rs index 4b3d99aab3f7..50433be02b68 100644 --- a/crates/polars/tests/it/io/parquet/arrow/write.rs +++ b/crates/polars/tests/it/io/parquet/arrow/write.rs @@ -40,7 +40,7 @@ fn round_trip_opt_stats( _ => unreachable!(), }; - let field = Field::new("a1", array.data_type().clone(), true); + let field = Field::new("a1".into(), array.data_type().clone(), true); let schema = ArrowSchema::from(vec![field]); let options = WriteOptions { diff --git a/crates/polars/tests/it/io/parquet/mod.rs b/crates/polars/tests/it/io/parquet/mod.rs index 3c756bbea274..5d088aab3b15 100644 --- a/crates/polars/tests/it/io/parquet/mod.rs +++ b/crates/polars/tests/it/io/parquet/mod.rs @@ -113,7 +113,7 @@ pub fn alltypes_plain(column: &str) -> Array { pub fn alltypes_statistics(column: &str) -> Statistics { match column { "id" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(0), @@ -121,7 +121,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "id-short-array" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(4), @@ -136,7 +136,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "tinyint_col" | "smallint_col" | "int_col" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), 
PhysicalType::Int32), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(0), @@ -144,7 +144,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "bigint_col" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int64), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Int64), null_count: Some(0), distinct_count: None, min_value: Some(0), @@ -152,7 +152,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "float_col" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Float), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Float), null_count: Some(0), distinct_count: None, min_value: Some(0.0), @@ -160,7 +160,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "double_col" => PrimitiveStatistics:: { - primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Double), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::Double), null_count: Some(0), distinct_count: None, min_value: Some(0.0), @@ -168,10 +168,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "date_string_col" => BinaryStatistics { - primitive_type: PrimitiveType::from_physical( - "col".to_string(), - PhysicalType::ByteArray, - ), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::ByteArray), null_count: Some(0), distinct_count: None, min_value: Some(vec![48, 49, 47, 48, 49, 47, 48, 57]), @@ -179,10 +176,7 @@ pub fn alltypes_statistics(column: &str) -> Statistics { } .into(), "string_col" => BinaryStatistics { - primitive_type: PrimitiveType::from_physical( - "col".to_string(), - PhysicalType::ByteArray, - ), + primitive_type: PrimitiveType::from_physical("col".into(), PhysicalType::ByteArray), null_count: 
Some(0), distinct_count: None, min_value: Some(vec![48]), diff --git a/crates/polars/tests/it/io/parquet/roundtrip.rs b/crates/polars/tests/it/io/parquet/roundtrip.rs index bc77d50afd8c..31c305c4db40 100644 --- a/crates/polars/tests/it/io/parquet/roundtrip.rs +++ b/crates/polars/tests/it/io/parquet/roundtrip.rs @@ -18,7 +18,7 @@ fn round_trip( compression: CompressionOptions, encodings: Vec, ) -> PolarsResult<()> { - let field = Field::new("a1", array.data_type().clone(), true); + let field = Field::new("a1".into(), array.data_type().clone(), true); let schema = ArrowSchema::from(vec![field]); let options = WriteOptions { diff --git a/crates/polars/tests/it/io/parquet/write/mod.rs b/crates/polars/tests/it/io/parquet/write/mod.rs index 9d1686ffdf87..02715030fb14 100644 --- a/crates/polars/tests/it/io/parquet/write/mod.rs +++ b/crates/polars/tests/it/io/parquet/write/mod.rs @@ -67,8 +67,8 @@ fn test_column(column: &str, compression: CompressionOptions) -> ParquetResult<( }; let schema = SchemaDescriptor::new( - "schema".to_string(), - vec![ParquetType::from_physical("col".to_string(), type_)], + "schema".into(), + vec![ParquetType::from_physical("col".into(), type_)], ); let a = schema.columns(); @@ -181,9 +181,9 @@ fn basic() -> ParquetResult<()> { }; let schema = SchemaDescriptor::new( - "schema".to_string(), + "schema".into(), vec![ParquetType::from_physical( - "col".to_string(), + "col".into(), PhysicalType::Int32, )], ); diff --git a/crates/polars/tests/it/io/parquet/write/sidecar.rs b/crates/polars/tests/it/io/parquet/write/sidecar.rs index 4df35d9e817d..00f4397ba6f4 100644 --- a/crates/polars/tests/it/io/parquet/write/sidecar.rs +++ b/crates/polars/tests/it/io/parquet/write/sidecar.rs @@ -6,11 +6,8 @@ use polars_parquet::parquet::write::{write_metadata_sidecar, FileWriter, Version #[test] fn basic() -> Result<(), ParquetError> { let schema = SchemaDescriptor::new( - "schema".to_string(), - vec![ParquetType::from_physical( - "c1".to_string(), - 
PhysicalType::Int32, - )], + "schema".into(), + vec![ParquetType::from_physical("c1".into(), PhysicalType::Int32)], ); let mut metadatas = vec![]; diff --git a/crates/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs index 37ed6e2720d5..0fa0ba1c66a9 100644 --- a/crates/polars/tests/it/joins.rs +++ b/crates/polars/tests/it/joins.rs @@ -36,10 +36,14 @@ fn join_nans_outer() -> PolarsResult<()> { #[test] #[cfg(feature = "lazy")] fn join_empty_datasets() -> PolarsResult<()> { - let a = DataFrame::new(Vec::from([Series::new_empty("foo", &DataType::Int64)])).unwrap(); + let a = DataFrame::new(Vec::from([Series::new_empty( + "foo".into(), + &DataType::Int64, + )])) + .unwrap(); let b = DataFrame::new(Vec::from([ - Series::new_empty("foo", &DataType::Int64), - Series::new_empty("bar", &DataType::Int64), + Series::new_empty("foo".into(), &DataType::Int64), + Series::new_empty("bar".into(), &DataType::Int64), ])) .unwrap(); diff --git a/crates/polars/tests/it/lazy/aggregation.rs b/crates/polars/tests/it/lazy/aggregation.rs index 33662c442959..ad433e139775 100644 --- a/crates/polars/tests/it/lazy/aggregation.rs +++ b/crates/polars/tests/it/lazy/aggregation.rs @@ -4,7 +4,7 @@ use super::*; #[cfg(feature = "temporal")] fn test_lazy_agg() { let s0 = DateChunked::parse_from_str_slice( - "date", + "date".into(), &[ "2020-08-21", "2020-08-21", @@ -15,8 +15,8 @@ fn test_lazy_agg() { "%Y-%m-%d", ) .into_series(); - let s1 = Series::new("temp", [20, 10, 7, 9, 1].as_ref()); - let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01].as_ref()); + let s1 = Series::new("temp".into(), [20, 10, 7, 9, 1].as_ref()); + let s2 = Series::new("rain".into(), [0.2, 0.1, 0.3, 0.1, 0.01].as_ref()); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let lf = df @@ -33,7 +33,7 @@ fn test_lazy_agg() { let new = lf.collect().unwrap(); let min = new.column("min").unwrap(); - assert_eq!(min, &Series::new("min", [0.1f64, 0.01, 0.1])); + assert_eq!(min, &Series::new("min".into(), [0.1f64, 0.01, 
0.1])); } #[test] diff --git a/crates/polars/tests/it/lazy/cwc.rs b/crates/polars/tests/it/lazy/cwc.rs index ae836354982e..2ad0ab11ede4 100644 --- a/crates/polars/tests/it/lazy/cwc.rs +++ b/crates/polars/tests/it/lazy/cwc.rs @@ -76,7 +76,7 @@ fn fuzz_cluster_with_columns() { let column = rng.gen_range(0..unused_cols.len()); let column = unused_cols.swap_remove(column); - series.push(Series::new(to_str!(column), vec![rnd_prime(rng)])); + series.push(Series::new(to_str!(column).into(), vec![rnd_prime(rng)])); used_cols.push(column); } diff --git a/crates/polars/tests/it/lazy/expressions/arity.rs b/crates/polars/tests/it/lazy/expressions/arity.rs index 9e0acb248acc..52ac97c56e62 100644 --- a/crates/polars/tests/it/lazy/expressions/arity.rs +++ b/crates/polars/tests/it/lazy/expressions/arity.rs @@ -197,7 +197,7 @@ fn test_update_groups_in_cast() -> PolarsResult<()> { let expected = df![ "group" => ["A" ,"B"], - "id"=> [AnyValue::List(Series::new("", [-2i64, -1])), AnyValue::List(Series::new("", [-2i64, -1, -1]))] + "id"=> [AnyValue::List(Series::new("".into(), [-2i64, -1])), AnyValue::List(Series::new("".into(), [-2i64, -1, -1]))] ]?; assert!(out.equals(&expected)); @@ -273,18 +273,18 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) .then(col("value").rank(Default::default(), None)) - .otherwise(lit(Series::new("", &[10 as IdxSize])))]) + .otherwise(lit(Series::new("".into(), &[10 as IdxSize])))]) .sort(["name"], Default::default()) .collect()?; let out = out.column("value")?; assert_eq!( out.get(0)?, - AnyValue::List(Series::new("", &[1 as IdxSize, 2 as IdxSize])) + AnyValue::List(Series::new("".into(), &[1 as IdxSize, 2 as IdxSize])) ); assert_eq!( out.get(1)?, - AnyValue::List(Series::new("", &[10 as IdxSize, 10 as IdxSize])) + AnyValue::List(Series::new("".into(), &[10 as IdxSize, 10 as IdxSize])) ); let out = df @@ -292,7 +292,7 @@ fn test_ternary_aggregation_set_literals() -> 
PolarsResult<()> { .lazy() .group_by([col("name")]) .agg([when(col("value").sum().eq(lit(3))) - .then(lit(Series::new("", &[10 as IdxSize])).alias("value")) + .then(lit(Series::new("".into(), &[10 as IdxSize])).alias("value")) .otherwise(col("value").rank(Default::default(), None))]) .sort(["name"], Default::default()) .collect()?; @@ -300,11 +300,11 @@ fn test_ternary_aggregation_set_literals() -> PolarsResult<()> { let out = out.column("value")?; assert_eq!( out.get(1)?, - AnyValue::List(Series::new("", &[1 as IdxSize, 2])) + AnyValue::List(Series::new("".into(), &[1 as IdxSize, 2])) ); assert_eq!( out.get(0)?, - AnyValue::List(Series::new("", &[10 as IdxSize, 10 as IdxSize])) + AnyValue::List(Series::new("".into(), &[10 as IdxSize, 10 as IdxSize])) ); let out = df diff --git a/crates/polars/tests/it/lazy/expressions/expand.rs b/crates/polars/tests/it/lazy/expressions/expand.rs index 27d8ee0ac1ad..69572ae0a454 100644 --- a/crates/polars/tests/it/lazy/expressions/expand.rs +++ b/crates/polars/tests/it/lazy/expressions/expand.rs @@ -13,7 +13,7 @@ fn test_expand_datetimes_3042() -> PolarsResult<()> { .and_hms_opt(0, 0, 0) .unwrap(); let date_range = polars_time::date_range( - "dt1", + "dt1".into(), low, high, Duration::parse("1w"), diff --git a/crates/polars/tests/it/lazy/expressions/is_in.rs b/crates/polars/tests/it/lazy/expressions/is_in.rs index e718b01ea032..73591af48328 100644 --- a/crates/polars/tests/it/lazy/expressions/is_in.rs +++ b/crates/polars/tests/it/lazy/expressions/is_in.rs @@ -6,7 +6,7 @@ fn test_is_in() -> PolarsResult<()> { "x" => [1, 2, 3], "y" => ["a", "b", "c"] ]?; - let s = Series::new("a", ["a", "b"]); + let s = Series::new("a".into(), ["a", "b"]); let out = df .lazy() diff --git a/crates/polars/tests/it/lazy/expressions/window.rs b/crates/polars/tests/it/lazy/expressions/window.rs index fec4a6527324..d617dd46574a 100644 --- a/crates/polars/tests/it/lazy/expressions/window.rs +++ b/crates/polars/tests/it/lazy/expressions/window.rs @@ -217,7 
+217,7 @@ fn test_window_mapping() -> PolarsResult<()> { .select([(lit(10) + col("A")).alias("foo").over([col("fruits")])]) .collect()?; - let expected = Series::new("foo", [11, 12, 13, 14, 15]); + let expected = Series::new("foo".into(), [11, 12, 13, 14, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -232,7 +232,7 @@ fn test_window_mapping() -> PolarsResult<()> { .over([col("fruits")]), ]) .collect()?; - let expected = Series::new("foo", [11, 12, 8, 9, 15]); + let expected = Series::new("foo".into(), [11, 12, 8, 9, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -247,7 +247,7 @@ fn test_window_mapping() -> PolarsResult<()> { .over([col("fruits")]), ]) .collect()?; - let expected = Series::new("foo", [None, Some(3), None, Some(-1), Some(-1)]); + let expected = Series::new("foo".into(), [None, Some(3), None, Some(-1), Some(-1)]); assert!(out.column("foo")?.equals_missing(&expected)); // now sorted @@ -259,7 +259,7 @@ fn test_window_mapping() -> PolarsResult<()> { .lazy() .select([(lit(10) + col("A")).alias("foo").over([col("fruits")])]) .collect()?; - let expected = Series::new("foo", [13, 14, 11, 12, 15]); + let expected = Series::new("foo".into(), [13, 14, 11, 12, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -275,7 +275,7 @@ fn test_window_mapping() -> PolarsResult<()> { ]) .collect()?; - let expected = Series::new("foo", [8, 9, 11, 12, 15]); + let expected = Series::new("foo".into(), [8, 9, 11, 12, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -290,7 +290,7 @@ fn test_window_mapping() -> PolarsResult<()> { ]) .collect()?; - let expected = Series::new("foo", [None, Some(-1), None, Some(3), Some(-1)]); + let expected = Series::new("foo".into(), [None, Some(-1), None, Some(3), Some(-1)]); assert!(out.column("foo")?.equals_missing(&expected)); Ok(()) diff --git a/crates/polars/tests/it/lazy/exprs.rs b/crates/polars/tests/it/lazy/exprs.rs index 66ccb4a7e444..45d550ae85a1 100644 
--- a/crates/polars/tests/it/lazy/exprs.rs +++ b/crates/polars/tests/it/lazy/exprs.rs @@ -7,16 +7,19 @@ fn fuzz_exprs() { use rand::Rng; let lf = DataFrame::new(vec![ - Series::new("A", vec![1, 2, 3, 4, 5]), - Series::new("B", vec![Some(5), Some(4), None, Some(2), Some(1)]), - Series::new("C", vec!["str", "", "a quite long string", "my", "string"]), + Series::new("A".into(), vec![1, 2, 3, 4, 5]), + Series::new("B".into(), vec![Some(5), Some(4), None, Some(2), Some(1)]), + Series::new( + "C".into(), + vec!["str", "", "a quite long string", "my", "string"], + ), ]) .unwrap() .lazy(); let empty = DataFrame::new(vec![ - Series::new("A", Vec::::new()), - Series::new("B", Vec::::new()), - Series::new("C", Vec::<&str>::new()), + Series::new("A".into(), Vec::::new()), + Series::new("B".into(), Vec::::new()), + Series::new("C".into(), Vec::<&str>::new()), ]) .unwrap() .lazy(); diff --git a/crates/polars/tests/it/lazy/group_by.rs b/crates/polars/tests/it/lazy/group_by.rs index 1ccb481d6ee0..ac76e4921e40 100644 --- a/crates/polars/tests/it/lazy/group_by.rs +++ b/crates/polars/tests/it/lazy/group_by.rs @@ -77,7 +77,10 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { .collect()?; let out = out.column("diff")?; - assert_eq!(out, &Series::new("diff", &[None, Some(26), Some(6), None])); + assert_eq!( + out, + &Series::new("diff".into(), &[None, Some(26), Some(6), None]) + ); Ok(()) } @@ -120,7 +123,7 @@ fn test_group_by_agg_list_with_not_aggregated() -> PolarsResult<()> { let out = out.explode()?; assert_eq!( out, - Series::new("value", &[0, 2, 1, 3, 2, 2, 7, 2, 3, 1, 2, 1]) + Series::new("value".into(), &[0, 2, 1, 3, 2, 2, 7, 2, 3, 1, 2, 1]) ); Ok(()) } @@ -174,7 +177,7 @@ fn test_filter_aggregated_expression() -> PolarsResult<()> { assert_eq!( x.get(1).unwrap(), - AnyValue::List(Series::new("", [0, 1, 2, 3, 4])) + AnyValue::List(Series::new("".into(), [0, 1, 2, 3, 4])) ); Ok(()) } diff --git a/crates/polars/tests/it/lazy/group_by_dynamic.rs 
b/crates/polars/tests/it/lazy/group_by_dynamic.rs index 6c65a4041ec8..4db863551faa 100644 --- a/crates/polars/tests/it/lazy/group_by_dynamic.rs +++ b/crates/polars/tests/it/lazy/group_by_dynamic.rs @@ -22,7 +22,7 @@ fn test_group_by_dynamic_week_bounds() -> PolarsResult<()> { .and_hms_opt(0, 0, 0) .unwrap(); let range = polars_time::date_range( - "dt", + "dt".into(), start, stop, Duration::parse("1d"), @@ -32,7 +32,7 @@ fn test_group_by_dynamic_week_bounds() -> PolarsResult<()> { )? .into_series(); - let a = Int32Chunked::full("a", 1, range.len()); + let a = Int32Chunked::full("a".into(), 1, range.len()); let df = df![ "dt" => range, "a" => a diff --git a/crates/polars/tests/it/lazy/predicate_queries.rs b/crates/polars/tests/it/lazy/predicate_queries.rs index ef5580439d30..ac180917fab3 100644 --- a/crates/polars/tests/it/lazy/predicate_queries.rs +++ b/crates/polars/tests/it/lazy/predicate_queries.rs @@ -135,7 +135,7 @@ fn test_is_in_categorical_3420() -> PolarsResult<()> { disable_string_cache(); let _sc = StringCacheHolder::hold(); - let s = Series::new("x", ["a", "b", "c"]) + let s = Series::new("x".into(), ["a", "b", "c"]) .strict_cast(&DataType::Categorical(None, Default::default()))?; let out = df .lazy() diff --git a/crates/polars/tests/it/lazy/projection_queries.rs b/crates/polars/tests/it/lazy/projection_queries.rs index c5764e507250..03b7a44bc114 100644 --- a/crates/polars/tests/it/lazy/projection_queries.rs +++ b/crates/polars/tests/it/lazy/projection_queries.rs @@ -147,8 +147,8 @@ fn test_projection_5086() -> PolarsResult<()> { #[cfg(feature = "dtype-struct")] fn test_unnest_pushdown() -> PolarsResult<()> { let df = df![ - "collection" => Series::full_null("", 1, &DataType::Int32), - "users" => Series::full_null("", 1, &DataType::List(Box::new(DataType::Struct(vec![Field::new("email", DataType::String)])))), + "collection" => Series::full_null("".into(), 1, &DataType::Int32), + "users" => Series::full_null("".into(), 1, 
&DataType::List(Box::new(DataType::Struct(vec![Field::new("email".into(), DataType::String)])))), ]?; let out = df diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index 8513efe0bc68..0be10b20f60e 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -4,7 +4,7 @@ use super::*; #[test] fn test_with_duplicate_column_empty_df() { - let a = Int32Chunked::from_slice("a", &[]); + let a = Int32Chunked::from_slice("a".into(), &[]); assert_eq!( DataFrame::new(vec![a.into_series()]) @@ -137,7 +137,7 @@ fn test_sorted_path() -> PolarsResult<()> { let payloads = &[1, 2, 3]; let df = df![ - "a"=> [AnyValue::List(Series::new("", payloads)), AnyValue::List(Series::new("", payloads)), AnyValue::List(Series::new("", payloads))] + "a"=> [AnyValue::List(Series::new("".into(), payloads)), AnyValue::List(Series::new("".into(), payloads)), AnyValue::List(Series::new("".into(), payloads))] ]?; let out = df @@ -234,11 +234,11 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { #[test] fn test_group_by_on_lists() -> PolarsResult<()> { - let s0 = Series::new("", [1i32, 2, 3]); - let s1 = Series::new("groups", [4i32, 5]); + let s0 = Series::new("".into(), [1i32, 2, 3]); + let s1 = Series::new("groups".into(), [4i32, 5]); let mut builder = - ListPrimitiveChunkedBuilder::::new("arrays", 10, 10, DataType::Int32); + ListPrimitiveChunkedBuilder::::new("arrays".into(), 10, 10, DataType::Int32); builder.append_series(&s0).unwrap(); builder.append_series(&s1).unwrap(); let s2 = builder.finish().into_series(); diff --git a/crates/polars/tests/it/schema.rs b/crates/polars/tests/it/schema.rs index a636c06961b8..c791367f7546 100644 --- a/crates/polars/tests/it/schema.rs +++ b/crates/polars/tests/it/schema.rs @@ -8,9 +8,9 @@ fn test_schema_rename() { fn test_case(old: &str, new: &str, expected: Option<(&str, Vec)>) { fn make_schema() -> Schema { Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", 
Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ]) } let mut schema = make_schema(); @@ -30,9 +30,9 @@ fn test_schema_rename() { Some(( "a", vec![ - Field::new("anton", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("anton".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], )), ); @@ -43,9 +43,9 @@ fn test_schema_rename() { Some(( "b", vec![ - Field::new("a", UInt64), - Field::new("bantam", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("bantam".into(), Int32), + Field::new("c".into(), Int8), ], )), ); @@ -82,9 +82,9 @@ fn test_schema_insert_at_index() { } let schema = Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ]); test_case( @@ -94,10 +94,10 @@ fn test_schema_insert_at_index() { ( None, vec![ - Field::new("new", String), - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("new".into(), String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ), ); @@ -109,9 +109,9 @@ fn test_schema_insert_at_index() { ( Some(UInt64), vec![ - Field::new("a", String), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), String), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ), ); @@ -123,9 +123,9 @@ fn test_schema_insert_at_index() { ( Some(Int32), vec![ - Field::new("b", String), - Field::new("a", UInt64), - Field::new("c", Int8), + Field::new("b".into(), String), + Field::new("a".into(), UInt64), + Field::new("c".into(), Int8), ], ), ); @@ -137,9 +137,9 @@ fn test_schema_insert_at_index() { ( Some(UInt64), vec![ - Field::new("b", Int32), - Field::new("a", String), - Field::new("c", Int8), + Field::new("b".into(), Int32), 
+ Field::new("a".into(), String), + Field::new("c".into(), Int8), ], ), ); @@ -151,9 +151,9 @@ fn test_schema_insert_at_index() { ( Some(UInt64), vec![ - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("a", String), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("a".into(), String), ], ), ); @@ -165,9 +165,9 @@ fn test_schema_insert_at_index() { ( Some(UInt64), vec![ - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("a", String), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("a".into(), String), ], ), ); @@ -179,10 +179,10 @@ fn test_schema_insert_at_index() { ( None, vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("new", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("new".into(), String), ], ), ); @@ -194,9 +194,9 @@ fn test_schema_insert_at_index() { ( Some(Int8), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), String), ], ), ); @@ -208,9 +208,9 @@ fn test_schema_insert_at_index() { ( Some(Int8), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), String), ], ), ); @@ -239,9 +239,9 @@ fn test_with_column() { } let schema = Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ]); test_case( @@ -251,9 +251,9 @@ fn test_with_column() { ( Some(UInt64), vec![ - Field::new("a", String), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), String), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ), ); @@ -265,9 +265,9 @@ fn 
test_with_column() { ( Some(Int32), vec![ - Field::new("a", UInt64), - Field::new("b", String), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), String), + Field::new("c".into(), Int8), ], ), ); @@ -279,9 +279,9 @@ fn test_with_column() { ( Some(Int8), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), String), ], ), ); @@ -293,10 +293,10 @@ fn test_with_column() { ( None, vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("d", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("d".into(), String), ], ), ); @@ -318,14 +318,14 @@ fn test_getters() { } let mut schema = Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ]); test_case!(schema, get, name: "a", &UInt64); test_case!(schema, get_full, name: "a", (0, &"a".into(), &UInt64)); - test_case!(schema, get_field, name: "a", Field::new("a", UInt64)); + test_case!(schema, get_field, name: "a", Field::new("a".into(), UInt64)); test_case!(schema, get_at_index, index: 1, (&"b".into(), &Int32)); test_case!(schema, get_at_index_mut, index: 1, (&mut "b".into(), &mut Int32)); @@ -366,10 +366,10 @@ fn test_removal() { } let schema = Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("d", Float64), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("d".into(), Float64), ]); test_case( @@ -377,14 +377,14 @@ fn test_removal() { "a", Some(UInt64), vec![ - Field::new("d", Float64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("d".into(), Float64), + Field::new("b".into(), 
Int32), + Field::new("c".into(), Int8), ], vec![ - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("d", Float64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("d".into(), Float64), ], ); @@ -393,14 +393,14 @@ fn test_removal() { "b", Some(Int32), vec![ - Field::new("a", UInt64), - Field::new("d", Float64), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("d".into(), Float64), + Field::new("c".into(), Int8), ], vec![ - Field::new("a", UInt64), - Field::new("c", Int8), - Field::new("d", Float64), + Field::new("a".into(), UInt64), + Field::new("c".into(), Int8), + Field::new("d".into(), Float64), ], ); @@ -409,14 +409,14 @@ fn test_removal() { "c", Some(Int8), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("d", Float64), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("d".into(), Float64), ], vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("d", Float64), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("d".into(), Float64), ], ); @@ -425,14 +425,14 @@ fn test_removal() { "d", Some(Float64), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ); @@ -441,16 +441,16 @@ fn test_removal() { "NOT_FOUND", None, vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("d", Float64), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("d".into(), Float64), ], vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), - Field::new("d", Float64), + 
Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), + Field::new("d".into(), Float64), ], ); } @@ -486,9 +486,9 @@ fn test_set_dtype() { } let schema = Schema::from_iter([ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ]); test_case( @@ -498,9 +498,9 @@ fn test_set_dtype() { ( Some(UInt64), vec![ - Field::new("a", String), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), String), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ), ); @@ -511,9 +511,9 @@ fn test_set_dtype() { ( Some(Int32), vec![ - Field::new("a", UInt64), - Field::new("b", String), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), String), + Field::new("c".into(), Int8), ], ), ); @@ -524,9 +524,9 @@ fn test_set_dtype() { ( Some(Int8), vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", String), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), String), ], ), ); @@ -537,9 +537,9 @@ fn test_set_dtype() { ( None, vec![ - Field::new("a", UInt64), - Field::new("b", Int32), - Field::new("c", Int8), + Field::new("a".into(), UInt64), + Field::new("b".into(), Int32), + Field::new("c".into(), Int8), ], ), ); diff --git a/crates/polars/tests/it/time/date_range.rs b/crates/polars/tests/it/time/date_range.rs index ff8df835cce2..f9ab68191a8d 100644 --- a/crates/polars/tests/it/time/date_range.rs +++ b/crates/polars/tests/it/time/date_range.rs @@ -14,7 +14,7 @@ fn test_time_units_9413() { .and_hms_opt(0, 0, 0) .unwrap(); let actual = date_range( - "date", + "date".into(), start, stop, Duration::parse("1d"), @@ -35,7 +35,7 @@ Series: 'date' [datetime[ms]] ])"#; assert_eq!(result, expected); let actual = date_range( - "date", + "date".into(), start, stop, Duration::parse("1d"), @@ -56,7 +56,7 @@ 
Series: 'date' [datetime[μs]] ])"#; assert_eq!(result, expected); let actual = date_range( - "date", + "date".into(), start, stop, Duration::parse("1d"), diff --git a/docs/src/rust/user-guide/concepts/data-structures.rs b/docs/src/rust/user-guide/concepts/data-structures.rs index 2334f7718569..b8a4b70daa14 100644 --- a/docs/src/rust/user-guide/concepts/data-structures.rs +++ b/docs/src/rust/user-guide/concepts/data-structures.rs @@ -2,7 +2,7 @@ fn main() { // --8<-- [start:series] use polars::prelude::*; - let s = Series::new("a", &[1, 2, 3, 4, 5]); + let s = Series::new("a".into(), &[1, 2, 3, 4, 5]); println!("{}", s); // --8<-- [end:series] @@ -39,7 +39,7 @@ fn main() { // --8<-- [end:tail] // --8<-- [start:sample] - let n = Series::new("", &[2]); + let n = Series::new("".into(), &[2]); let sampled_df = df.sample_n(&n, false, false, None).unwrap(); println!("{}", sampled_df); diff --git a/docs/src/rust/user-guide/expressions/aggregation.rs b/docs/src/rust/user-guide/expressions/aggregation.rs index fe5e13a38940..90f39c9d04ad 100644 --- a/docs/src/rust/user-guide/expressions/aggregation.rs +++ b/docs/src/rust/user-guide/expressions/aggregation.rs @@ -116,7 +116,7 @@ fn main() -> Result<(), Box> { compute_age() .filter(col("gender").eq(lit(gender))) .mean() - .alias(&format!("avg {} birthday", gender)) + .alias(format!("avg {} birthday", gender)) } let df = dataset diff --git a/docs/src/rust/user-guide/expressions/casting.rs b/docs/src/rust/user-guide/expressions/casting.rs index b18ca19022df..85824afc3198 100644 --- a/docs/src/rust/user-guide/expressions/casting.rs +++ b/docs/src/rust/user-guide/expressions/casting.rs @@ -135,7 +135,7 @@ fn main() -> Result<(), Box> { use chrono::prelude::*; let date = polars::time::date_range( - "date", + "date".into(), NaiveDate::from_ymd_opt(2022, 1, 1) .unwrap() .and_hms_opt(0, 0, 0) @@ -152,7 +152,7 @@ fn main() -> Result<(), Box> { .cast(&DataType::Date)?; let datetime = polars::time::date_range( - "datetime", + 
"datetime".into(), NaiveDate::from_ymd_opt(2022, 1, 1) .unwrap() .and_hms_opt(0, 0, 0) @@ -185,7 +185,7 @@ fn main() -> Result<(), Box> { // --8<-- [start:dates2] let date = polars::time::date_range( - "date", + "date".into(), NaiveDate::from_ymd_opt(2022, 1, 1) .unwrap() .and_hms_opt(0, 0, 0) diff --git a/docs/src/rust/user-guide/expressions/column-selections.rs b/docs/src/rust/user-guide/expressions/column-selections.rs index f3cacebd8c0c..c0f3f35ac3b0 100644 --- a/docs/src/rust/user-guide/expressions/column-selections.rs +++ b/docs/src/rust/user-guide/expressions/column-selections.rs @@ -9,14 +9,14 @@ fn main() -> Result<(), Box> { let df = df!( "id" => &[9, 4, 2], "place" => &["Mars", "Earth", "Saturn"], - "date" => date_range("date", + "date" => date_range("date".into(), NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), Duration::parse("1d"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, "sales" => &[33.4, 2142134.1, 44.7], "has_people" => &[false, true, false], - "logged_at" => date_range("logged_at", + "logged_at" => date_range("logged_at".into(), NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 2).unwrap(), Duration::parse("1s"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, )? 
- .with_row_index("index", None)?; + .with_row_index("index".into(), None)?; println!("{}", &df); // --8<-- [end:selectors_df] diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs index c03824c7e368..9ce160cd58aa 100644 --- a/docs/src/rust/user-guide/expressions/lists.rs +++ b/docs/src/rust/user-guide/expressions/lists.rs @@ -134,11 +134,11 @@ fn main() -> Result<(), Box> { // --8<-- [start:array_df] let mut col1: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_1", 8, 8, DataType::Int32); + ListPrimitiveChunkedBuilder::new("Array_1".into(), 8, 8, DataType::Int32); col1.append_slice(&[1, 3]); col1.append_slice(&[2, 5]); let mut col2: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_2", 8, 8, DataType::Int32); + ListPrimitiveChunkedBuilder::new("Array_2".into(), 8, 8, DataType::Int32); col2.append_slice(&[1, 7, 3]); col2.append_slice(&[8, 1, 0]); let array_df = DataFrame::new(vec![ diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index 0722b2aac5ee..25ed02daf827 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -17,7 +17,7 @@ fn main() -> Result<(), Box> { let out = ratings .clone() .lazy() - .select([col("Theatre").value_counts(true, true, "count".to_string(), false)]) + .select([col("Theatre").value_counts(true, true, "count", false)]) .collect()?; println!("{}", &out); // --8<-- [end:state_value_counts] @@ -26,7 +26,7 @@ fn main() -> Result<(), Box> { let out = ratings .clone() .lazy() - .select([col("Theatre").value_counts(true, true, "count".to_string(), false)]) + .select([col("Theatre").value_counts(true, true, "count", false)]) .unnest(["Theatre"]) .collect()?; println!("{}", &out); @@ -39,7 +39,7 @@ fn main() -> Result<(), Box> { "Theatre" => &["NE", "ME"], "Avg_Rating" => &[4.5, 4.9], )? 
- .into_struct("ratings") + .into_struct("ratings".into()) .into_series(); println!("{}", &rating_series); // // --8<-- [end:series_struct] @@ -54,7 +54,7 @@ fn main() -> Result<(), Box> { .lazy() .select([col("ratings") .struct_() - .rename_fields(["Film".into(), "State".into(), "Value".into()].to_vec())]) + .rename_fields(["Film", "State", "Value"].to_vec())]) .unnest(["ratings"]) .collect()?; diff --git a/docs/src/rust/user-guide/transformations/time-series/parsing.rs b/docs/src/rust/user-guide/transformations/time-series/parsing.rs index a58b5cf2850e..b12c488d0108 100644 --- a/docs/src/rust/user-guide/transformations/time-series/parsing.rs +++ b/docs/src/rust/user-guide/transformations/time-series/parsing.rs @@ -60,13 +60,13 @@ fn main() -> Result<(), Box> { Some(TimeUnit::Microseconds), None, StrptimeOptions { - format: Some("%Y-%m-%dT%H:%M:%S%z".to_string()), + format: Some("%Y-%m-%dT%H:%M:%S%z".into()), ..Default::default() }, lit("raise"), ) .dt() - .convert_time_zone("Europe/Brussels".to_string()); + .convert_time_zone("Europe/Brussels".into()); let mixed_parsed = df!("date" => &data)?.lazy().select([q]).collect()?; println!("{}", &mixed_parsed); diff --git a/docs/src/rust/user-guide/transformations/time-series/resampling.rs b/docs/src/rust/user-guide/transformations/time-series/resampling.rs index e1cd4baa1682..dec19f65fc26 100644 --- a/docs/src/rust/user-guide/transformations/time-series/resampling.rs +++ b/docs/src/rust/user-guide/transformations/time-series/resampling.rs @@ -6,7 +6,7 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:df] let time = polars::time::date_range( - "time", + "time".into(), NaiveDate::from_ymd_opt(2021, 12, 16) .unwrap() .and_hms_opt(0, 0, 0) diff --git a/docs/src/rust/user-guide/transformations/time-series/rolling.rs b/docs/src/rust/user-guide/transformations/time-series/rolling.rs index 559bf0bc2fed..19b57f2d0c33 100644 --- a/docs/src/rust/user-guide/transformations/time-series/rolling.rs +++ 
b/docs/src/rust/user-guide/transformations/time-series/rolling.rs @@ -45,7 +45,7 @@ fn main() -> Result<(), Box> { // --8<-- [start:group_by_dyn] let time = polars::time::date_range( - "time", + "time".into(), NaiveDate::from_ymd_opt(2021, 1, 1) .unwrap() .and_hms_opt(0, 0, 0) @@ -106,7 +106,7 @@ fn main() -> Result<(), Box> { // --8<-- [start:group_by_roll] let time = polars::time::date_range( - "time", + "time".into(), NaiveDate::from_ymd_opt(2021, 12, 16) .unwrap() .and_hms_opt(0, 0, 0) diff --git a/docs/src/rust/user-guide/transformations/time-series/timezones.rs b/docs/src/rust/user-guide/transformations/time-series/timezones.rs index 4924338b4f86..489786cb844e 100644 --- a/docs/src/rust/user-guide/transformations/time-series/timezones.rs +++ b/docs/src/rust/user-guide/transformations/time-series/timezones.rs @@ -5,7 +5,7 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:example] let ts = ["2021-03-27 03:00", "2021-03-28 03:00"]; - let tz_naive = Series::new("tz_naive", &ts); + let tz_naive = Series::new("tz_naive".into(), &ts); let time_zones_df = DataFrame::new(vec![tz_naive])? 
.lazy() .select([col("tz_naive").str().to_datetime( @@ -16,7 +16,7 @@ fn main() -> Result<(), Box> { )]) .with_columns([col("tz_naive") .dt() - .replace_time_zone(Some("UTC".to_string()), lit("raise"), NonExistent::Raise) + .replace_time_zone(Some("UTC".into()), lit("raise"), NonExistent::Raise) .alias("tz_aware")]) .collect()?; @@ -30,14 +30,14 @@ fn main() -> Result<(), Box> { col("tz_aware") .dt() .replace_time_zone( - Some("Europe/Brussels".to_string()), + Some("Europe/Brussels".into()), lit("raise"), NonExistent::Raise, ) .alias("replace time zone"), col("tz_aware") .dt() - .convert_time_zone("Asia/Kathmandu".to_string()) + .convert_time_zone("Asia/Kathmandu".into()) .alias("convert time zone"), col("tz_aware") .dt() diff --git a/docs/user-guide/io/hugging-face.md b/docs/user-guide/io/hugging-face.md index 1a94210d657b..16f705ae75fb 100644 --- a/docs/user-guide/io/hugging-face.md +++ b/docs/user-guide/io/hugging-face.md @@ -65,7 +65,7 @@ See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blo #### Parquet -{{code_block('user-guide/io/hugging-face','scan_parquet_hive',['scan_parquet'])}} +{{code_block('user-guide/io/hugging-face','scan_parquet_hive_repr',['scan_parquet'])}} ```python exec="on" result="text" session="user-guide/io/hugging-face" --8<-- "python/user-guide/io/hugging-face.py:scan_parquet_hive_repr" diff --git a/examples/datasets/tpc_heads/customer.feather b/examples/datasets/pds_heads/customer.feather similarity index 100% rename from examples/datasets/tpc_heads/customer.feather rename to examples/datasets/pds_heads/customer.feather diff --git a/examples/datasets/tpc_heads/lineitem.feather b/examples/datasets/pds_heads/lineitem.feather similarity index 100% rename from examples/datasets/tpc_heads/lineitem.feather rename to examples/datasets/pds_heads/lineitem.feather diff --git a/examples/datasets/tpc_heads/nation.feather b/examples/datasets/pds_heads/nation.feather similarity index 100% rename from 
examples/datasets/tpc_heads/nation.feather rename to examples/datasets/pds_heads/nation.feather diff --git a/examples/datasets/tpc_heads/orders.feather b/examples/datasets/pds_heads/orders.feather similarity index 100% rename from examples/datasets/tpc_heads/orders.feather rename to examples/datasets/pds_heads/orders.feather diff --git a/examples/datasets/tpc_heads/part.feather b/examples/datasets/pds_heads/part.feather similarity index 100% rename from examples/datasets/tpc_heads/part.feather rename to examples/datasets/pds_heads/part.feather diff --git a/examples/datasets/tpc_heads/partsupp.feather b/examples/datasets/pds_heads/partsupp.feather similarity index 100% rename from examples/datasets/tpc_heads/partsupp.feather rename to examples/datasets/pds_heads/partsupp.feather diff --git a/examples/datasets/tpc_heads/region.feather b/examples/datasets/pds_heads/region.feather similarity index 100% rename from examples/datasets/tpc_heads/region.feather rename to examples/datasets/pds_heads/region.feather diff --git a/examples/datasets/tpc_heads/supplier.feather b/examples/datasets/pds_heads/supplier.feather similarity index 100% rename from examples/datasets/tpc_heads/supplier.feather rename to examples/datasets/pds_heads/supplier.feather diff --git a/examples/read_csv/src/main.rs b/examples/read_csv/src/main.rs index aa9188f19409..877fc6483635 100644 --- a/examples/read_csv/src/main.rs +++ b/examples/read_csv/src/main.rs @@ -2,7 +2,7 @@ use polars::io::mmap::MmapBytesReader; use polars::prelude::*; fn main() -> PolarsResult<()> { - let file = std::fs::File::open("/home/ritchie46/Downloads/tpch/tables_scale_100/lineitem.tbl") + let file = std::fs::File::open("/home/ritchie46/Downloads/pdsh/tables_scale_100/lineitem.tbl") .unwrap(); let file = Box::new(file) as Box; let _df = CsvReader::new(file) diff --git a/py-polars/Makefile b/py-polars/Makefile index 7e273b14914c..3c98adab08cb 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -113,7 +113,7 @@ clean: ## 
Clean up caches and build artifacts @rm -rf .mypy_cache/ @rm -rf .pytest_cache/ @$(VENV_BIN)/ruff clean - @rm -rf tests/data/tpch/sf* + @rm -rf tests/data/pdsh/sf* @rm -f .coverage @rm -f coverage.xml @rm -f polars/polars.abi3.so diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 950aebc4331f..83511fbc371a 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -124,7 +124,8 @@ import numpy.typing as npt import torch from great_tables import GT - from xlsxwriter import Workbook, Worksheet + from xlsxwriter import Workbook + from xlsxwriter.worksheet import Worksheet from polars import DataType, Expr, LazyFrame, Series from polars._typing import ( diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 54c9ba55e09d..962891d7aa58 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -8746,30 +8746,31 @@ def upper_bound(self) -> Expr: def sign(self) -> Expr: """ - Compute the element-wise indication of the sign. + Compute the element-wise sign function on numeric types. - The returned values can be -1, 0, or 1: + The returned value is computed as follows: - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. + * -1 if x < 0. + * 1 if x > 0. + * x otherwise (typically 0, but could be NaN if the input is). - (null values are preserved as-is). + Null values are preserved as-is, and the dtype of the input is preserved. 
Examples -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, float("nan"), None]}) + >>> df.select(pl.col.a.sign()) + shape: (6, 1) ┌──────┐ │ a │ │ --- │ - │ i64 │ + │ f64 │ ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ + │ -1.0 │ + │ -0.0 │ + │ 0.0 │ + │ 1.0 │ + │ NaN │ │ null │ └──────┘ """ diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index a7138f19bfd5..8ba891d70a59 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -180,9 +180,6 @@ def cum_count(*columns: str, reverse: bool = False) -> Expr: This function is syntactic sugar for `col(columns).cum_count()`. - If no arguments are passed, returns the cumulative count of a context. - Rows containing null values count towards the result. - Parameters ---------- *columns diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 57cca5d366d3..f13efb7aa3b3 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -121,7 +121,7 @@ def next_batches(self, n: int) -> list[DataFrame] | None: Examples -------- >>> reader = pl.read_csv_batched( - ... "./tpch/tables_scale_100/lineitem.tbl", + ... "./pdsh/tables_scale_100/lineitem.tbl", ... separator="|", ... try_parse_dates=True, ... ) # doctest: +SKIP diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 7b3d3a91dbf3..3a27911d716e 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -827,7 +827,7 @@ def read_csv_batched( Examples -------- >>> reader = pl.read_csv_batched( - ... "./tpch/tables_scale_100/lineitem.tbl", + ... "./pdsh/tables_scale_100/lineitem.tbl", ... separator="|", ... try_parse_dates=True, ... 
) # doctest: +SKIP diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 901e424641fa..87e0d38f0b51 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4991,27 +4991,28 @@ def mode(self) -> Series: def sign(self) -> Series: """ - Compute the element-wise indication of the sign. + Compute the element-wise sign function on numeric types. - The returned values can be -1, 0, or 1: + The returned value is computed as follows: - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. + * -1 if x < 0. + * 1 if x > 0. + * x otherwise (typically 0, but could be NaN if the input is). - (null values are preserved as-is). + Null values are preserved as-is, and the dtype of the input is preserved. Examples -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, float("nan"), None]) >>> s.sign() - shape: (5,) - Series: 'a' [i64] + shape: (6,) + Series: 'a' [f64] [ - -1 - 0 - 0 - 1 + -1.0 + -0.0 + 0.0 + 1.0 + NaN null ] """ diff --git a/py-polars/polars/testing/asserts/frame.py b/py-polars/polars/testing/asserts/frame.py index 5d6112b6cb08..800289d2952e 100644 --- a/py-polars/polars/testing/asserts/frame.py +++ b/py-polars/polars/testing/asserts/frame.py @@ -257,6 +257,7 @@ def assert_frame_not_equal( """ __tracebackhide__ = True + _assert_correct_input_type(left, right) try: assert_frame_equal( left=left, @@ -272,5 +273,5 @@ def assert_frame_not_equal( except AssertionError: return else: - msg = "frames are equal" + msg = "frames are equal (but are expected not to be)" raise AssertionError(msg) diff --git a/py-polars/polars/testing/asserts/series.py b/py-polars/polars/testing/asserts/series.py index ad316f565aad..65e5169cab74 100644 --- a/py-polars/polars/testing/asserts/series.py +++ b/py-polars/polars/testing/asserts/series.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from 
polars._utils.deprecation import deprecate_renamed_parameter from polars.datatypes import ( @@ -20,6 +20,19 @@ from polars import DataType +def _assert_correct_input_type(left: Any, right: Any) -> bool: + __tracebackhide__ = True + + if not (isinstance(left, Series) and isinstance(right, Series)): + raise_assertion_error( + "inputs", + "unexpected input types", + type(left).__name__, + type(right).__name__, + ) + return True + + @deprecate_renamed_parameter("check_dtype", "check_dtypes", version="0.20.31") def assert_series_equal( left: Series, @@ -90,13 +103,7 @@ def assert_series_equal( """ __tracebackhide__ = True - if not (isinstance(left, Series) and isinstance(right, Series)): # type: ignore[redundant-expr] - raise_assertion_error( - "inputs", - "unexpected input types", - type(left).__name__, - type(right).__name__, - ) + _assert_correct_input_type(left, right) if left.len() != right.len(): raise_assertion_error("Series", "length mismatch", left.len(), right.len()) @@ -404,6 +411,7 @@ def assert_series_not_equal( """ __tracebackhide__ = True + _assert_correct_input_type(left, right) try: assert_series_equal( left=left, @@ -419,5 +427,5 @@ def assert_series_not_equal( except AssertionError: return else: - msg = "Series are equal" + msg = "Series are equal (but are expected not to be)" raise AssertionError(msg) diff --git a/py-polars/tests/benchmark/data/__init__.py b/py-polars/tests/benchmark/data/__init__.py index b7f246f37abc..255752458b72 100644 --- a/py-polars/tests/benchmark/data/__init__.py +++ b/py-polars/tests/benchmark/data/__init__.py @@ -1,6 +1,6 @@ """Data generation functionality for use in the benchmarking suite.""" from tests.benchmark.data.h2oai import generate_group_by_data -from tests.benchmark.data.tpch import load_tpch_table +from tests.benchmark.data.pdsh import load_pdsh_table -__all__ = ["load_tpch_table", "generate_group_by_data"] +__all__ = ["load_pdsh_table", "generate_group_by_data"] diff --git 
a/py-polars/tests/benchmark/data/pdsh/__init__.py b/py-polars/tests/benchmark/data/pdsh/__init__.py new file mode 100644 index 000000000000..ef007f5ed8d9 --- /dev/null +++ b/py-polars/tests/benchmark/data/pdsh/__init__.py @@ -0,0 +1,5 @@ +"""Generate data for the PDS-H benchmark tests.""" + +from tests.benchmark.data.pdsh.generate_data import load_pdsh_table + +__all__ = ["load_pdsh_table"] diff --git a/py-polars/tests/benchmark/data/tpch/dbgen/dbgen b/py-polars/tests/benchmark/data/pdsh/dbgen/dbgen similarity index 100% rename from py-polars/tests/benchmark/data/tpch/dbgen/dbgen rename to py-polars/tests/benchmark/data/pdsh/dbgen/dbgen diff --git a/py-polars/tests/benchmark/data/tpch/dbgen/dists.dss b/py-polars/tests/benchmark/data/pdsh/dbgen/dists.dss similarity index 100% rename from py-polars/tests/benchmark/data/tpch/dbgen/dists.dss rename to py-polars/tests/benchmark/data/pdsh/dbgen/dists.dss diff --git a/py-polars/tests/benchmark/data/tpch/generate_data.py b/py-polars/tests/benchmark/data/pdsh/generate_data.py similarity index 73% rename from py-polars/tests/benchmark/data/tpch/generate_data.py rename to py-polars/tests/benchmark/data/pdsh/generate_data.py index 3b4d81be51ff..0d54f914bb28 100644 --- a/py-polars/tests/benchmark/data/tpch/generate_data.py +++ b/py-polars/tests/benchmark/data/pdsh/generate_data.py @@ -1,8 +1,19 @@ """ -Script to generate data for running the TPC-H benchmark. - -Data generation logic was adapted from the TPC-H benchmark tools: -https://www.tpc.org/tpch/ +Disclaimer. + +Certain portions of the contents of this file are derived from TPC-H version 3.0.1 +(retrieved from +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). +Such portions are subject to copyrights held by Transaction Processing +Performance Council (“TPC”) and licensed under the TPC EULA is available at +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) +(the “TPC EULA”). 
+ +You may not use this file except in compliance with the TPC EULA. +DISCLAIMER: Portions of this file is derived from the TPC-H benchmark and as +such any result obtained using this file are not comparable to published TPC-H +Benchmark results, as the results obtained from using this file do not comply with +the TPC-H Benchmark. """ from __future__ import annotations @@ -19,12 +30,12 @@ CURRENT_DIR = Path(__file__).parent DBGEN_DIR = CURRENT_DIR / "dbgen" -__all__ = ["load_tpch_table"] +__all__ = ["load_pdsh_table"] -def load_tpch_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame: +def load_pdsh_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame: """ - Load a TPC-H table from disk. + Load PDS-H table from disk. If the file does not exist, it is generated along with all other tables. """ @@ -32,16 +43,16 @@ def load_tpch_table(table_name: str, scale_factor: float = 0.01) -> pl.DataFrame file_path = folder / f"{table_name}.parquet" if not file_path.exists(): - _generate_tpch_data(scale_factor) + _generate_pdsh_data(scale_factor) return pl.read_parquet(file_path) -def _generate_tpch_data(scale_factor: float = 0.01) -> None: - """Generate all TPC-H datasets with the given scale factor.""" +def _generate_pdsh_data(scale_factor: float = 0.01) -> None: + """Generate all PDS-H datasets with the given scale factor.""" # TODO: Can we make this work under Windows? 
if sys.platform == "win32": - msg = "cannot generate TPC-H data under Windows" + msg = "cannot generate PDS-H data under Windows" raise RuntimeError(msg) subprocess.run(["./dbgen", "-f", "-v", "-s", str(scale_factor)], cwd=DBGEN_DIR) diff --git a/py-polars/tests/benchmark/data/tpch/__init__.py b/py-polars/tests/benchmark/data/tpch/__init__.py index 2973049f3fbd..ef007f5ed8d9 100644 --- a/py-polars/tests/benchmark/data/tpch/__init__.py +++ b/py-polars/tests/benchmark/data/tpch/__init__.py @@ -1,5 +1,5 @@ -"""Generate data for the TPC-H benchmark tests.""" +"""Generate data for the PDS-H benchmark tests.""" -from tests.benchmark.data.tpch.generate_data import load_tpch_table +from tests.benchmark.data.pdsh.generate_data import load_pdsh_table -__all__ = ["load_tpch_table"] +__all__ = ["load_pdsh_table"] diff --git a/py-polars/tests/benchmark/test_tpch.py b/py-polars/tests/benchmark/test_pdsh.py similarity index 91% rename from py-polars/tests/benchmark/test_tpch.py rename to py-polars/tests/benchmark/test_pdsh.py index bd09d92161c9..2ee601b5b895 100644 --- a/py-polars/tests/benchmark/test_tpch.py +++ b/py-polars/tests/benchmark/test_pdsh.py @@ -1,58 +1,76 @@ +""" +Disclaimer. + +Certain portions of the contents of this file are derived from TPC-H version 3.0.1 +(retrieved from +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). +Such portions are subject to copyrights held by Transaction Processing +Performance Council (“TPC”) and licensed under the TPC EULA is available at +http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) +(the “TPC EULA”). + +You may not use this file except in compliance with the TPC EULA. +DISCLAIMER: Portions of this file is derived from the TPC-H benchmark and as +such any result obtained using this file are not comparable to published TPC-H +Benchmark results, as the results obtained from using this file do not comply with +the TPC-H Benchmark. 
+""" + import sys from datetime import date import pytest import polars as pl -from tests.benchmark.data import load_tpch_table +from tests.benchmark.data import load_pdsh_table if sys.platform == "win32": - pytest.skip("TPC-H data cannot be generated under Windows", allow_module_level=True) + pytest.skip("PDS-H data cannot be generated under Windows", allow_module_level=True) pytestmark = pytest.mark.benchmark() @pytest.fixture(scope="module") def customer() -> pl.LazyFrame: - return load_tpch_table("customer").lazy() + return load_pdsh_table("customer").lazy() @pytest.fixture(scope="module") def lineitem() -> pl.LazyFrame: - return load_tpch_table("lineitem").lazy() + return load_pdsh_table("lineitem").lazy() @pytest.fixture(scope="module") def nation() -> pl.LazyFrame: - return load_tpch_table("nation").lazy() + return load_pdsh_table("nation").lazy() @pytest.fixture(scope="module") def orders() -> pl.LazyFrame: - return load_tpch_table("orders").lazy() + return load_pdsh_table("orders").lazy() @pytest.fixture(scope="module") def part() -> pl.LazyFrame: - return load_tpch_table("part").lazy() + return load_pdsh_table("part").lazy() @pytest.fixture(scope="module") def partsupp() -> pl.LazyFrame: - return load_tpch_table("partsupp").lazy() + return load_pdsh_table("partsupp").lazy() @pytest.fixture(scope="module") def region() -> pl.LazyFrame: - return load_tpch_table("region").lazy() + return load_pdsh_table("region").lazy() @pytest.fixture(scope="module") def supplier() -> pl.LazyFrame: - return load_tpch_table("supplier").lazy() + return load_pdsh_table("supplier").lazy() -def test_tpch_q1(lineitem: pl.LazyFrame) -> None: +def test_pdsh_q1(lineitem: pl.LazyFrame) -> None: var1 = date(1998, 9, 2) q_final = ( @@ -81,7 +99,7 @@ def test_tpch_q1(lineitem: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q2( +def test_pdsh_q2( nation: pl.LazyFrame, part: pl.LazyFrame, partsupp: pl.LazyFrame, @@ -125,7 +143,7 @@ def test_tpch_q2( q_final.collect() -def 
test_tpch_q3( +def test_pdsh_q3( customer: pl.LazyFrame, lineitem: pl.LazyFrame, orders: pl.LazyFrame ) -> None: var1 = "BUILDING" @@ -154,7 +172,7 @@ def test_tpch_q3( q_final.collect() -def test_tpch_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = date(1993, 7, 1) var2 = date(1993, 10, 1) @@ -170,7 +188,7 @@ def test_tpch_q4(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q5( +def test_pdsh_q5( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -205,7 +223,7 @@ def test_tpch_q5( q_final.collect() -def test_tpch_q6(lineitem: pl.LazyFrame) -> None: +def test_pdsh_q6(lineitem: pl.LazyFrame) -> None: var1 = date(1994, 1, 1) var2 = date(1995, 1, 1) var3 = 0.05 @@ -225,7 +243,7 @@ def test_tpch_q6(lineitem: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q7( +def test_pdsh_q7( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -274,7 +292,7 @@ def test_tpch_q7( q_final.collect() -def test_tpch_q8( +def test_pdsh_q8( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -322,7 +340,7 @@ def test_tpch_q8( q_final.collect() -def test_tpch_q9( +def test_pdsh_q9( lineitem: pl.LazyFrame, nation: pl.LazyFrame, orders: pl.LazyFrame, @@ -357,7 +375,7 @@ def test_tpch_q9( q_final.collect() -def test_tpch_q10( +def test_pdsh_q10( customer: pl.LazyFrame, lineitem: pl.LazyFrame, nation: pl.LazyFrame, @@ -404,7 +422,7 @@ def test_tpch_q10( q_final.collect() -def test_tpch_q11( +def test_pdsh_q11( nation: pl.LazyFrame, partsupp: pl.LazyFrame, supplier: pl.LazyFrame ) -> None: var1 = "GERMANY" @@ -438,7 +456,7 @@ def test_tpch_q11( q_final.collect() -def test_tpch_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = "MAIL" var2 = "SHIP" var3 = date(1994, 1, 1) @@ -467,7 +485,7 @@ def 
test_tpch_q12(lineitem: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: var1 = "special" var2 = "requests" @@ -484,7 +502,7 @@ def test_tpch_q13(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: var1 = date(1995, 9, 1) var2 = date(1995, 10, 1) @@ -507,7 +525,7 @@ def test_tpch_q14(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: +def test_pdsh_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: var1 = date(1996, 1, 1) var2 = date(1996, 4, 1) @@ -532,7 +550,7 @@ def test_tpch_q15(lineitem: pl.LazyFrame, supplier: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q16( +def test_pdsh_q16( part: pl.LazyFrame, partsupp: pl.LazyFrame, supplier: pl.LazyFrame ) -> None: var1 = "Brand#45" @@ -558,7 +576,7 @@ def test_tpch_q16( q_final.collect() -def test_tpch_q17(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q17(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: var1 = "Brand#23" var2 = "MED BOX" @@ -579,7 +597,7 @@ def test_tpch_q17(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q18( +def test_pdsh_q18( customer: pl.LazyFrame, lineitem: pl.LazyFrame, orders: pl.LazyFrame ) -> None: var1 = 300 @@ -608,7 +626,7 @@ def test_tpch_q18( q_final.collect() -def test_tpch_q19(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: +def test_pdsh_q19(lineitem: pl.LazyFrame, part: pl.LazyFrame) -> None: q_final = ( part.join(lineitem, left_on="p_partkey", right_on="l_partkey") .filter(pl.col("l_shipmode").is_in(["AIR", "AIR REG"])) @@ -649,7 +667,7 @@ def test_tpch_q19(lineitem: 
pl.LazyFrame, part: pl.LazyFrame) -> None: q_final.collect() -def test_tpch_q20( +def test_pdsh_q20( lineitem: pl.LazyFrame, nation: pl.LazyFrame, part: pl.LazyFrame, @@ -687,7 +705,7 @@ def test_tpch_q20( q_final.collect() -def test_tpch_q21( +def test_pdsh_q21( lineitem: pl.LazyFrame, nation: pl.LazyFrame, orders: pl.LazyFrame, @@ -723,7 +741,7 @@ def test_tpch_q21( q_final.collect() -def test_tpch_q22(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: +def test_pdsh_q22(customer: pl.LazyFrame, orders: pl.LazyFrame) -> None: q1 = ( customer.with_columns(pl.col("c_phone").str.slice(0, 2).alias("cntrycode")) .filter(pl.col("cntrycode").str.contains("13|31|23|29|30|18|17")) diff --git a/py-polars/tests/unit/constructors/test_constructors.py b/py-polars/tests/unit/constructors/test_constructors.py index ffda370aa538..fda072930525 100644 --- a/py-polars/tests/unit/constructors/test_constructors.py +++ b/py-polars/tests/unit/constructors/test_constructors.py @@ -1677,6 +1677,7 @@ def __arrow_c_array__(self, requested_schema: object = None) -> object: def test_pycapsule_interface(df: pl.DataFrame) -> None: + df = df.rechunk() pyarrow_table = df.to_arrow() # Array via C data interface diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 1de1e08e74d4..ea1798fe7114 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -2294,7 +2294,7 @@ def test_weekday_vs_stdlib_datetime( ) -> None: result = ( pl.Series([value], dtype=pl.Datetime(time_unit)) - .dt.replace_time_zone(time_zone, non_existent="null") + .dt.replace_time_zone(time_zone, non_existent="null", ambiguous="null") .dt.weekday() .item() ) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index c61b07ce88c4..2fd7d2ab4193 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1715,6 +1715,35 @@ def 
test_parametric_small_page_mask_filtering( assert_frame_equal(result, df.filter(expr)) +@pytest.mark.parametrize( + "value", + [ + "abcd", + 0, + 0.0, + False, + ], +) +def test_different_page_validity_across_pages(value: str | int | float | bool) -> None: + df = pl.DataFrame( + { + "a": [None] + [value] * 4000, + } + ) + + f = io.BytesIO() + pq.write_table( + df.to_arrow(), + f, + use_dictionary=False, + data_page_size=1024, + column_encoding={"a": "PLAIN"}, + ) + + f.seek(0) + assert_frame_equal(df, pl.read_parquet(f)) + + @given( df=dataframes( min_size=0, diff --git a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py index 4c6877d08694..57a8ce795dc1 100644 --- a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py +++ b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py @@ -343,10 +343,6 @@ def test_parse_apply_raw_functions() -> None: ): df1 = lf.select(pl.col("a").map_elements(func)).collect() df2 = lf.select(getattr(pl.col("a"), func_name)()).collect() - if func_name == "sign": - # note: Polars' 'sign' function returns an Int64, while numpy's - # 'sign' function returns a Float64 - df1 = df1.with_columns(pl.col("a").cast(pl.Int64)) assert_frame_equal(df1, df2) # test bare 'json.loads' diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 24e65ac1dc6d..14aefa93c3c1 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -167,7 +167,7 @@ def test_list_struct_explode_6905() -> None: }, schema={"group": pl.List(pl.Struct([pl.Field("params", pl.List(pl.Int32))]))}, )["group"].list.explode().to_list() == [ - {"params": None}, + None, {"params": [1]}, {"params": []}, ] @@ -447,3 +447,8 @@ def test_explode_17648() -> None: .with_columns(pl.int_ranges(pl.col("a").list.len()).alias("count")) .explode("a", "count") 
).to_dict(as_series=False) == {"a": [2, 6, 7, 3, 9, 2], "count": [0, 1, 2, 0, 1, 2]} + + +def test_explode_struct_nulls() -> None: + df = pl.DataFrame({"A": [[{"B": 1}], [None], []]}) + assert df.explode("A").to_dict(as_series=False) == {"A": [{"B": 1}, None, None]} diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 08884308af48..3f7f159ccae0 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -1747,8 +1747,8 @@ def test_sign() -> None: assert_series_equal(a.sign(), expected) # Floats - a = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - expected = pl.Series("a", [-1, 0, 0, 1, None]) + a = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, float("nan"), None]) + expected = pl.Series("a", [-1.0, 0.0, 0.0, 1.0, float("nan"), None]) assert_series_equal(a.sign(), expected) # Invalid input diff --git a/py-polars/tests/unit/sql/test_operators.py b/py-polars/tests/unit/sql/test_operators.py index 668ead0bc087..278e0776f2d7 100644 --- a/py-polars/tests/unit/sql/test_operators.py +++ b/py-polars/tests/unit/sql/test_operators.py @@ -37,7 +37,7 @@ def test_div() -> None: [ [-0.0995024875621891, 2.85714285714286, 12.0, None, -15.92356687898089], [-1, 2, 12, None, -16], - [-1, 1, 1, None, -1], + [-1.0, 1.0, 1.0, None, -1.0], ], schema=["a_div_b", "a_floordiv_b", "b_sign"], ), diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index a330cda1b8a9..5a519dc94e2c 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -783,3 +783,15 @@ def test_cse_chunks_18124() -> None: ) .filter(pl.col("ts_diff") > 1) ).collect().shape == (4, 4) + + +def test_eager_cse_during_struct_expansion_18411() -> None: + df = pl.DataFrame({"foo": [0, 0, 0, 1, 1]}) + vc = pl.col("foo").value_counts() + classes = vc.struct[0] + counts = vc.struct[1] + # Check if output is stable + assert ( + df.select(pl.col("foo").replace(classes, counts)) + == 
df.select(pl.col("foo").replace(classes, counts)) + )["foo"].all() diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index 9586bfb0a2ae..700100ced4c4 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -78,11 +78,19 @@ def test_unnest_projection_pushdown() -> None: pl.col("field_2").cast(pl.Categorical).alias("col"), pl.col("value"), ) - out = mlf.collect().to_dict(as_series=False) + + out = ( + mlf.sort( + [pl.col.row.cast(pl.String), pl.col.col.cast(pl.String)], + maintain_order=True, + ) + .collect() + .to_dict(as_series=False) + ) assert out == { - "row": ["y", "y", "b", "b"], - "col": ["z", "z", "c", "c"], - "value": [1, 2, 2, 3], + "row": ["b", "b", "y", "y"], + "col": ["c", "c", "z", "z"], + "value": [2, 3, 1, 2], } diff --git a/py-polars/tests/unit/testing/test_assert_frame_equal.py b/py-polars/tests/unit/testing/test_assert_frame_equal.py index 4cb8b3f5106f..f6a1c9f192cd 100644 --- a/py-polars/tests/unit/testing/test_assert_frame_equal.py +++ b/py-polars/tests/unit/testing/test_assert_frame_equal.py @@ -278,13 +278,17 @@ def test_assert_frame_equal_pass() -> None: assert_frame_equal(df1, df2) -def test_assert_frame_equal_types() -> None: +@pytest.mark.parametrize( + "assert_function", + [assert_frame_equal, assert_frame_not_equal], +) +def test_assert_frame_equal_types(assert_function: Any) -> None: df1 = pl.DataFrame({"a": [1, 2]}) srs1 = pl.Series(values=[1, 2], name="a") with pytest.raises( AssertionError, match=r"inputs are different \(unexpected input types\)" ): - assert_frame_equal(df1, srs1) # type: ignore[arg-type] + assert_function(df1, srs1) def test_assert_frame_equal_length_mismatch() -> None: @@ -295,6 +299,7 @@ def test_assert_frame_equal_length_mismatch() -> None: match=r"DataFrames are different \(number of rows does not match\)", ): assert_frame_equal(df1, df2) + assert_frame_not_equal(df1, df2) def test_assert_frame_equal_column_mismatch() 
-> None: @@ -304,6 +309,7 @@ def test_assert_frame_equal_column_mismatch() -> None: AssertionError, match="columns \\['a'\\] in left DataFrame, but not in right" ): assert_frame_equal(df1, df2) + assert_frame_not_equal(df1, df2) def test_assert_frame_equal_column_mismatch2() -> None: @@ -314,6 +320,7 @@ def test_assert_frame_equal_column_mismatch2() -> None: match="columns \\['b', 'c'\\] in right LazyFrame, but not in left", ): assert_frame_equal(df1, df2) + assert_frame_not_equal(df1, df2) def test_assert_frame_equal_column_mismatch_order() -> None: @@ -323,6 +330,7 @@ def test_assert_frame_equal_column_mismatch_order() -> None: assert_frame_equal(df1, df2) assert_frame_equal(df1, df2, check_column_order=False) + assert_frame_not_equal(df1, df2) def test_assert_frame_equal_check_row_order() -> None: @@ -331,25 +339,33 @@ def test_assert_frame_equal_check_row_order() -> None: with pytest.raises(AssertionError, match="value mismatch for column 'a'"): assert_frame_equal(df1, df2) + assert_frame_equal(df1, df2, check_row_order=False) + assert_frame_not_equal(df1, df2) def test_assert_frame_equal_check_row_col_order() -> None: df1 = pl.DataFrame({"a": [1, 2], "b": [4, 3]}) - df3 = pl.DataFrame({"b": [3, 4], "a": [2, 1]}) + df2 = pl.DataFrame({"b": [3, 4], "a": [2, 1]}) with pytest.raises(AssertionError, match="columns are not in the same order"): - assert_frame_equal(df1, df3, check_row_order=False) - assert_frame_equal(df1, df3, check_row_order=False, check_column_order=False) + assert_frame_equal(df1, df2, check_row_order=False) + + assert_frame_equal(df1, df2, check_row_order=False, check_column_order=False) + assert_frame_not_equal(df1, df2) -def test_assert_frame_equal_check_row_order_unsortable() -> None: +@pytest.mark.parametrize( + "assert_function", + [assert_frame_equal, assert_frame_not_equal], +) +def test_assert_frame_equal_check_row_order_unsortable(assert_function: Any) -> None: df1 = pl.DataFrame({"a": [object(), object()], "b": [3, 4]}) df2 = 
pl.DataFrame({"a": [object(), object()], "b": [4, 3]}) with pytest.raises( TypeError, match="cannot set `check_row_order=False`.*unsortable columns" ): - assert_frame_equal(df1, df2, check_row_order=False) + assert_function(df1, df2, check_row_order=False) def test_assert_frame_equal_dtypes_mismatch() -> None: @@ -360,6 +376,9 @@ def test_assert_frame_equal_dtypes_mismatch() -> None: with pytest.raises(AssertionError, match="dtypes do not match"): assert_frame_equal(df1, df2, check_column_order=False) + assert_frame_not_equal(df1, df2, check_column_order=False) + assert_frame_not_equal(df1, df2) + def test_assert_frame_not_equal() -> None: df = pl.DataFrame({"a": [1, 2]}) diff --git a/py-polars/tests/unit/testing/test_assert_series_equal.py b/py-polars/tests/unit/testing/test_assert_series_equal.py index 92ebe13a0104..c523fe193a30 100644 --- a/py-polars/tests/unit/testing/test_assert_series_equal.py +++ b/py-polars/tests/unit/testing/test_assert_series_equal.py @@ -35,10 +35,11 @@ def test_assert_series_equal_parametric_array(data: st.DataObject) -> None: def test_compare_series_value_mismatch() -> None: srs1 = pl.Series([1, 2, 3]) srs2 = pl.Series([2, 3, 4]) - assert_series_not_equal(srs1, srs2) + with pytest.raises( - AssertionError, match=r"Series are different \(exact value mismatch\)" + AssertionError, + match=r"Series are different \(exact value mismatch\)", ): assert_series_equal(srs1, srs2) @@ -46,25 +47,33 @@ def test_compare_series_value_mismatch() -> None: def test_compare_series_empty_equal() -> None: srs1 = pl.Series([]) srs2 = pl.Series(()) - assert_series_equal(srs1, srs2) - with pytest.raises(AssertionError): + + with pytest.raises( + AssertionError, + match=r"Series are equal \(but are expected not to be\)", + ): assert_series_not_equal(srs1, srs2) def test_assert_series_equal_check_order() -> None: srs1 = pl.Series([1, 2, 3, None]) srs2 = pl.Series([2, None, 3, 1]) - assert_series_equal(srs1, srs2, check_order=False) - with 
pytest.raises(AssertionError): + + with pytest.raises( + AssertionError, + match=r"Series are equal \(but are expected not to be\)", + ): assert_series_not_equal(srs1, srs2, check_order=False) def test_assert_series_equal_check_order_unsortable_type() -> None: s = pl.Series([object(), object()]) - - with pytest.raises(TypeError): + with pytest.raises( + TypeError, + match="cannot set `check_order=False` on Series with unsortable data type", + ): assert_series_equal(s, s, check_order=False) @@ -123,32 +132,45 @@ def test_compare_series_value_mismatch_string() -> None: assert_series_not_equal(srs1, srs2) with pytest.raises( - AssertionError, match=r"Series are different \(exact value mismatch\)" + AssertionError, + match=r"Series are different \(exact value mismatch\)", ): assert_series_equal(srs1, srs2) -def test_compare_series_type_mismatch() -> None: +def test_compare_series_dtype_mismatch() -> None: srs1 = pl.Series([1, 2, 3]) - srs2 = pl.DataFrame({"col1": [2, 3, 4]}) + srs2 = pl.Series([1.0, 2.0, 3.0]) + assert_series_not_equal(srs1, srs2) with pytest.raises( - AssertionError, match=r"inputs are different \(unexpected input types\)" + AssertionError, + match=r"Series are different \(dtype mismatch\)", ): - assert_series_equal(srs1, srs2) # type: ignore[arg-type] + assert_series_equal(srs1, srs2) + + +@pytest.mark.parametrize( + "assert_function", [assert_series_equal, assert_series_not_equal] +) +def test_compare_series_input_type_mismatch(assert_function: Any) -> None: + srs1 = pl.Series([1, 2, 3]) + srs2 = pl.DataFrame({"col1": [2, 3, 4]}) - srs3 = pl.Series([1.0, 2.0, 3.0]) - assert_series_not_equal(srs1, srs3) with pytest.raises( - AssertionError, match=r"Series are different \(dtype mismatch\)" + AssertionError, + match=r"inputs are different \(unexpected input types\)", ): - assert_series_equal(srs1, srs3) + assert_function(srs1, srs2) def test_compare_series_name_mismatch() -> None: srs1 = pl.Series(values=[1, 2, 3], name="srs1") srs2 = 
pl.Series(values=[1, 2, 3], name="srs2") - with pytest.raises(AssertionError, match=r"Series are different \(name mismatch\)"): + with pytest.raises( + AssertionError, + match=r"Series are different \(name mismatch\)", + ): assert_series_equal(srs1, srs2) @@ -158,7 +180,8 @@ def test_compare_series_length_mismatch() -> None: assert_series_not_equal(srs1, srs2) with pytest.raises( - AssertionError, match=r"Series are different \(length mismatch\)" + AssertionError, + match=r"Series are different \(length mismatch\)", ): assert_series_equal(srs1, srs2) @@ -167,7 +190,8 @@ def test_compare_series_value_exact_mismatch() -> None: srs1 = pl.Series([1.0, 2.0, 3.0]) srs2 = pl.Series([1.0, 2.0 + 1e-7, 3.0]) with pytest.raises( - AssertionError, match=r"Series are different \(exact value mismatch\)" + AssertionError, + match=r"Series are different \(exact value mismatch\)", ): assert_series_equal(srs1, srs2, check_exact=True) @@ -537,7 +561,10 @@ def test_assert_series_equal_full_series() -> None: def test_assert_series_not_equal() -> None: s = pl.Series("a", [1, 2]) - with pytest.raises(AssertionError, match="Series are equal"): + with pytest.raises( + AssertionError, + match=r"Series are equal \(but are expected not to be\)", + ): assert_series_not_equal(s, s) @@ -546,7 +573,10 @@ def test_assert_series_equal_nested_list_float() -> None: s1 = pl.Series([[1.0, 2.0], [3.0, 4.0]], dtype=pl.List(pl.Float64)) s2 = pl.Series([[1.0, 2.0], [3.0, 4.9]], dtype=pl.List(pl.Float64)) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(nested value mismatch\)", + ): assert_series_equal(s1, s2) @@ -560,7 +590,10 @@ def test_assert_series_equal_nested_struct_float() -> None: dtype=pl.Struct({"a": pl.Float64, "b": pl.Float64}), ) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(nested value mismatch\)", + ): assert_series_equal(s1, s2) @@ -570,7 +603,10 @@ def 
test_assert_series_equal_full_null_incompatible_dtypes_raises() -> None: # You could argue this should pass, but it's rare enough not to warrant the # additional check - with pytest.raises(AssertionError, match="incompatible data types"): + with pytest.raises( + AssertionError, + match="incompatible data types", + ): assert_series_equal(s1, s2, check_dtypes=False) @@ -595,9 +631,16 @@ def test_assert_series_equal_uint_overflow() -> None: s1 = pl.Series([1, 2, 3], dtype=pl.UInt8) s2 = pl.Series([2, 3, 4], dtype=pl.UInt8) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(exact value mismatch\)", + ): assert_series_equal(s1, s2, atol=0) - with pytest.raises(AssertionError): + + with pytest.raises( + AssertionError, + match=r"Series are different \(exact value mismatch\)", + ): assert_series_equal(s1, s2, atol=1) left = pl.Series( @@ -616,7 +659,10 @@ def test_assert_series_equal_uint_always_checked_exactly() -> None: s1 = pl.Series([1, 3], dtype=pl.UInt8) s2 = pl.Series([2, 4], dtype=pl.Int64) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(exact value mismatch\)", + ): assert_series_equal(s1, s2, atol=1, check_dtypes=False) @@ -624,9 +670,15 @@ def test_assert_series_equal_nested_int_always_checked_exactly() -> None: s1 = pl.Series([[1, 2], [3, 4]]) s2 = pl.Series([[1, 2], [3, 5]]) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(exact value mismatch\)", + ): assert_series_equal(s1, s2, atol=1) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, + match=r"Series are different \(exact value mismatch\)", + ): assert_series_equal(s1, s2, check_exact=True) @@ -635,7 +687,9 @@ def test_assert_series_equal_array_equal(check_exact: bool) -> None: s1 = pl.Series([[1.0, 2.0], [3.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) s2 = pl.Series([[1.0, 2.0], [3.0, 4.2]], 
dtype=pl.Array(pl.Float64, 2)) - with pytest.raises(AssertionError): + with pytest.raises( + AssertionError, match=r"Series are different \(nested value mismatch\)" + ): assert_series_equal(s1, s2, check_exact=check_exact)