From 53e0d1669bdabef4c74f5553273fddfd50619f18 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Thu, 26 Sep 2024 17:59:01 -0700 Subject: [PATCH 1/9] Add initial version of array function. --- crates/polars-plan/Cargo.toml | 2 +- .../src/dsl/function_expr/array.rs | 135 ++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 7edc15ea8616..d5676fecdf8d 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -41,7 +41,7 @@ rayon = { workspace = true } recursive = { workspace = true } regex = { workspace = true, optional = true } serde = { workspace = true, features = ["rc"], optional = true } -serde_json = { workspace = true, optional = true } +serde_json = { workspace = true, optional = false } strum_macros = { workspace = true } [build-dependencies] diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index dce6d44bce94..99a72fec46f7 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -2,6 +2,19 @@ use polars_ops::chunked_array::array::*; use super::*; use crate::{map, map_as_slice}; +use polars_core::with_match_physical_numeric_polars_type; +use std::collections::HashMap; +use arrow::bitmap::MutableBitmap; +use arrow::array::{PrimitiveArray, FixedSizeListArray}; + + + +#[derive(Clone, Deserialize)] +struct ArrayKwargs { + // I guess DataType is not one of the serializable types? + // In the source code I see this done vie Wrap + dtype_expr: String, +} #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -19,6 +32,7 @@ pub enum ArrayFunction { Any, #[cfg(feature = "array_any_all")] All, + Array, Sort(SortOptions), Reverse, ArgMin, @@ -46,6 +60,8 @@ impl ArrayFunction { Median => mapper.map_to_float_dtype(), #[cfg(feature = "array_any_all")] Any | All => mapper.with_dtype(DataType::Boolean), + // TODO: Figure out how to bind keyword argument + Array => array_output_type(mapper.args(), ArrayKwargs { dtype_expr: "".to_string() }), Sort(_) => mapper.with_same_dtype(), Reverse => mapper.with_same_dtype(), ArgMin | ArgMax => mapper.with_dtype(IDX_DTYPE), @@ -60,6 +76,43 @@ impl ArrayFunction { } } +fn deserialize_dtype(dtype_expr: &str) -> PolarsResult> { + match dtype_expr.len() { + 0 => Ok(None), + _ => match serde_json::from_str::(dtype_expr) { + Ok(Expr::DtypeColumn(dtypes)) if dtypes.len() == 1 => Ok(Some(dtypes[0].clone())), + Ok(_) => Err( + polars_err!(ComputeError: "Expected a DtypeColumn expression with a single dtype"), + ), + Err(_) => Err(polars_err!(ComputeError: "Could not deserialize dtype expression")), + }, + } +} +fn array_output_type(input_fields: &[Field], kwargs: ArrayKwargs) -> PolarsResult { + let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? + .unwrap_or(input_fields[0].dtype.clone()); + + for field in input_fields.iter() { + if !field.dtype().is_numeric() { + polars_bail!(ComputeError: "all input fields must be numeric") + } + } + + /* + // For now, allow casting to either the first, or provided dtype + for field in input_fields.iter().skip(1) { + if field.dtype != expected_dtype { + polars_bail!(ComputeError: "all input fields must have the same type") + } + } + */ + + Ok(Field::new( + PlSmallStr::from_static("array"), + DataType::Array(Box::new(expected_dtype), input_fields.len()), + )) +} + fn map_array_dtype_to_list_dtype(datatype: &DataType) -> PolarsResult { if let DataType::Array(inner, _) = datatype { Ok(DataType::List(inner.clone())) @@ -85,6 +138,7 @@ impl Display for ArrayFunction { Any => "any", #[cfg(feature = "array_any_all")] All => "all", + Array => "array", Sort(_) => "sort", Reverse => "reverse", ArgMin => "arg_min", @@ -118,6 +172,7 @@ impl From for SpecialEq> { Any => map!(any), #[cfg(feature = "array_any_all")] All => map!(all), + Array => map_as_slice!(array_new), Sort(options) => map!(sort, options), Reverse => map!(reverse), ArgMin => map!(arg_min), @@ -133,6 +188,86 @@ impl From for SpecialEq> { } } +// Create a new array from a slice of series +fn array_new(inputs: &[Column]) -> PolarsResult { + let kwargs = ArrayKwargs {dtype_expr: "".to_string()}; + let ref dtype = deserialize_dtype(&kwargs.dtype_expr)? + .unwrap_or(inputs[0].dtype().clone()); + + // This conversion is yuck, there is probably a standard way to go from &[Column] to &[Series] + let series: Vec = inputs.iter().map(|col| col.clone().take_materialized_series()).collect(); + + // Convert dtype to native numeric type and invoke array_numeric + let res_series = with_match_physical_numeric_polars_type!(dtype, |$T| { + array_numeric::<$T>(&series[..], dtype) + })?; + + Ok(res_series.into_column()) +} + +// Combine numeric series into an array +fn array_numeric<'a, T: PolarsNumericType>(inputs: &[Series], dtype: &DataType) + -> PolarsResult { + let rows = inputs[0].len(); + let cols = inputs.len(); + let capacity = cols * rows; + + let mut values: Vec = vec![T::Native::default(); capacity]; + + // Support for casting + // Cast fields to the target dtype as needed + let mut casts = HashMap::new(); + for j in 0..cols { + if inputs[j].dtype() != dtype { + let cast_input = inputs[j].cast(dtype)?; + casts.insert(j, cast_input); + } + } + + let mut cols_ca = Vec::new(); + for j in 0..cols { + if inputs[j].dtype() != dtype { + cols_ca.push(casts.get(&j).expect("expect conversion").unpack::()?); + } else { + cols_ca.push(inputs[j].unpack::()?); + } + } + + for i in 0..rows { + for j in 0..cols { + values[i * cols + j] = unsafe { cols_ca[j].value_unchecked(i) }; + } + } + + let validity = if cols_ca.iter().any(|col| col.has_nulls()) { + let mut validity = MutableBitmap::from_len_zeroed(capacity); + for (j, col) in cols_ca.iter().enumerate() { + let mut row_offset = 0; + for chunk in col.chunks() { + if let Some(chunk_validity) = chunk.validity() { + for set_bit in chunk_validity.true_idx_iter() { + validity.set(cols * (row_offset + set_bit) + j, true); + } + } else { + for chunk_row in 0..chunk.len() { + validity.set(cols * (row_offset + chunk_row) + j, true); + } + } + row_offset += chunk.len(); + } + } + Some(validity.into()) + } else { + None + }; + + let values_array = PrimitiveArray::from_vec(values).with_validity(validity); + let dtype = DataType::Array(Box::new(dtype.clone()), cols); + let arrow_dtype = dtype.to_arrow(CompatLevel::newest()); + let array = FixedSizeListArray::try_new(arrow_dtype.clone(), Box::new(values_array), None)?; + Ok(unsafe {Series::_try_from_arrow_unchecked("Array".into(), vec![Box::new(array)], &arrow_dtype)?}) +} + pub(super) fn max(s: &Column) -> PolarsResult { Ok(s.array()?.array_max().into()) } From cb895b20038ed37f6361408f03bd20f0253e9b96 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Fri, 27 Sep 2024 16:48:28 -0700 Subject: [PATCH 2/9] Add unit tests --- .../src/dsl/function_expr/array.rs | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 99a72fec46f7..5b4a3a320a25 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -190,7 +190,10 @@ impl From for SpecialEq> { // Create a new array from a slice of series fn array_new(inputs: &[Column]) -> PolarsResult { - let kwargs = ArrayKwargs {dtype_expr: "".to_string()}; + let kwargs = ArrayKwargs { dtype_expr: "".to_string() }; + array_internal(inputs, kwargs) +} +fn array_internal(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { let ref dtype = deserialize_dtype(&kwargs.dtype_expr)? .unwrap_or(inputs[0].dtype().clone()); @@ -384,3 +387,90 @@ pub(super) fn shift(s: &[Column]) -> PolarsResult { ca.array_shift(n.as_materialized_series()).map(Column::from) } + + +#[cfg(test)] +mod test { + use polars_core::datatypes::Field; + use polars_core::frame::DataFrame; + use polars_core::prelude::{Column, Series}; + use super::*; + + #[test] + fn test_array_f64() { + println!("\ntest_array_f64"); + let f1 = Series::new("f1".into(), &[1.0, 2.0]); + let f2 = Series::new("f2".into(), &[3.0, 4.0]); + + let mut cols : Vec = Vec::new(); + cols.push(Column::Series(f1)); + cols.push(Column::Series(f2)); + + let array_df = DataFrame::new(cols.clone()).unwrap(); + println!("input df\n{}\n", &array_df); + + let mut fields: Vec = Vec::new(); + for col in &cols{ + let f: Field = (col.field().to_mut()).clone(); + fields.push(f); + } + let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string()}; + let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + println!("expected result\n{:?}\n", &expected_result); + + let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + println!("actual result\n{:?}", &new_arr); + + assert!(new_arr.is_ok()); + assert_eq!(new_arr.unwrap().dtype(), expected_result.dtype()); + } + + fn i32_series() -> (Vec, Vec, DataFrame){ + let f1 = Series::new("f1".into(), &[1, 2]); + let f2 = Series::new("f2".into(), &[3, 4]); + + let mut cols : Vec = Vec::new(); + cols.push(Column::Series(f1)); + cols.push(Column::Series(f2)); + + let array_df = DataFrame::new(cols.clone()).unwrap(); + println!("input df\n{}\n", &array_df); + + let mut fields: Vec = Vec::new(); + for col in &cols{ + let f: Field = (col.field().to_mut()).clone(); + fields.push(f); + } + (cols, fields, array_df) + } + + #[test] + fn test_array_i32() { + println!("\ntest_array_i32"); + let (_cols, fields, array_df) = i32_series(); + let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Int32\"]}".to_string()}; + let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + println!("expected result\n{:?}\n", &expected_result); + + let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + println!("actual result\n{:?}", &new_arr); + + assert!(new_arr.is_ok()); + assert_eq!(new_arr.unwrap().dtype(), expected_result.dtype()); + } + + #[test] + fn test_array_i32_converted() { + println!("\ntest_array_i32_converted"); + let (_cols, fields, array_df) = i32_series(); + let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string()}; + let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + println!("expected result\n{:?}\n", &expected_result); + + let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + println!("actual result\n{:?}", &new_arr); + + assert!(new_arr.is_ok()); + assert_eq!(new_arr.unwrap().dtype(), expected_result.dtype()); + } +} From 3462cfefe30690789cdd6602a4bfe0b3e3780910 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Mon, 30 Sep 2024 18:52:46 -0700 Subject: [PATCH 3/9] Add linkage to python and simple test. --- .cargo/config.toml | 8 ++ crates/polars-plan/src/dsl/array.rs | 17 ++++ crates/polars-python/src/functions/lazy.rs | 7 ++ py-polars/polars/__init__.py | 2 + py-polars/polars/functions/__init__.py | 2 + py-polars/polars/functions/as_datatype.py | 20 +++++ py-polars/src/lib.rs | 2 + .../unit/functions/as_datatype/test_array.py | 78 +++++++++++++++++++ 8 files changed, 136 insertions(+) create mode 100644 py-polars/tests/unit/functions/as_datatype/test_array.py diff --git a/.cargo/config.toml b/.cargo/config.toml index 1f678a632560..4b6473049ab8 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,3 +1,11 @@ [env] # Tune jemalloc (https://github.com/pola-rs/polars/issues/18088). JEMALLOC_SYS_WITH_MALLOC_CONF = "dirty_decay_ms:500,muzzy_decay_ms:-1" + +[target.'cfg(all())'] +rustflags = [ + "-C", "link-arg=-Wl,-rpath,.../pyenv.git/versions/3.10.15/lib", + "-C", "link-arg=-L.../pyenv.git/versions/3.10.15/lib", + "-C", "link-arg=-lpython3.10", +] + diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index 23faf363421c..c891ba6de7b0 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -1,4 +1,5 @@ use polars_core::prelude::*; + #[cfg(feature = "array_to_struct")] use polars_ops::chunked_array::array::{ arr_default_struct_name_gen, ArrToStructNameGenerator, ToStruct, @@ -194,3 +195,19 @@ impl ArrayNameSpace { ) } } + +pub fn array_from_expr, IE: Into + Clone>(s: E) -> PolarsResult { + let s: Vec<_> = s.as_ref().iter().map(|e| e.clone().into()).collect(); + + polars_ensure!(!s.is_empty(), ComputeError: "`array` needs one or more expressions"); + + Ok(Expr::Function { + input: s, + function: FunctionExpr::ArrayExpr(ArrayFunction::Array), + options: FunctionOptions { + collect_groups: ApplyOptions::ElementWise, + flags: FunctionFlags::default() | FunctionFlags::INPUT_WILDCARD_EXPANSION, + ..Default::default() + }, + }) +} diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index 7e3ea213667e..d369be986ff7 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -199,6 +199,13 @@ pub fn concat_list(s: Vec) -> PyResult { Ok(expr.into()) } +#[pyfunction] +pub fn array(s: Vec) -> PyResult { + let s = s.into_iter().map(|e| e.inner).collect::>(); + let expr = dsl::array_from_expr(s).map_err(PyPolarsErr::from)?; + Ok(expr.into()) +} + #[pyfunction] pub fn concat_str(s: Vec, separator: &str, ignore_nulls: bool) -> PyExpr { let s = s.into_iter().map(|e| e.inner).collect::>(); diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 10f0ee54228b..33fd3ceeba3a 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -87,6 +87,7 @@ collect_all_async, concat, concat_list, + array, concat_str, corr, count, @@ -315,6 +316,7 @@ "collect_all", "collect_all_async", "concat_list", + "array", "concat_str", "corr", "count", diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index fedd0ac2bff0..ac42a605bc8d 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -15,6 +15,7 @@ ) from polars.functions.as_datatype import ( concat_list, + array, concat_str, duration, format, @@ -124,6 +125,7 @@ "collect_all", "collect_all_async", "concat_list", + "array", "concat_str", "corr", "count", diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py index 5937e5c5b091..fcc75cfdfd0f 100644 --- a/py-polars/polars/functions/as_datatype.py +++ b/py-polars/polars/functions/as_datatype.py @@ -501,6 +501,26 @@ def concat_list(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> return wrap_expr(plr.concat_list(exprs)) +def array(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: + """ + Horizontally concatenate columns into a single array column. + + Operates in linear time. + + Parameters + ---------- + exprs + Columns to concatenate into a single list column. Accepts expression input. + Strings are parsed as column names, other non-expression inputs are parsed as + literals. + *more_exprs + Additional columns to concatenate into a single list column, specified as + positional arguments. + """ + exprs = parse_into_list_of_expressions(exprs, *more_exprs) + return wrap_expr(plr.array(exprs)) + + @overload def struct( *exprs: IntoExpr | Iterable[IntoExpr], diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 484583f85150..6b6b516938b1 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -176,6 +176,8 @@ fn polars(py: Python, m: &Bound) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::concat_list)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::array)) + .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::concat_str)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::len)).unwrap(); diff --git a/py-polars/tests/unit/functions/as_datatype/test_array.py b/py-polars/tests/unit/functions/as_datatype/test_array.py new file mode 100644 index 000000000000..9ffc814d68fb --- /dev/null +++ b/py-polars/tests/unit/functions/as_datatype/test_array.py @@ -0,0 +1,78 @@ +import pytest + +import polars as pl +from polars.exceptions import ComputeError +from polars.testing import assert_frame_equal, assert_series_equal + + +def build_array_f64(): + df = pl.DataFrame( + [ + pl.Series("f1", [1, 2]), + pl.Series("f2", [3, 4]), + ], + schema={ + "f1": pl.Float64, + "f2": pl.Float64, + }, + ) + print(df) + + # Now we call our plugin: + # sleep(30) + result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Float64))) + print(result) + +def build_array_i32(): + df = pl.DataFrame( + [ + pl.Series("f1", [1, 2]), + pl.Series("f2", [3, None]), + ], + schema={ + "f1": pl.Int32, + "f2": pl.Int32, + }, + ) + print(df) + + # Now we call our plugin: + # sleep(30) + result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Int32))) + print(result) + +def build_array_i32_converted(): + df = pl.DataFrame( + [ + pl.Series("f1", [1, None]), + pl.Series("f2", [None, 4]), + ], + schema={ + "f1": pl.Int32, + "f2": pl.Int32, + }, + ) + print(df) + + # Now we call our plugin: + # sleep(30) + result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Float64))) + print(result) + +def test_array() -> None: + s0 = pl.Series("a", [1.0, 2.0], dtype=pl.Float64) + s1 = pl.Series("b", [3.0, 4.0], dtype=pl.Float64) + expected = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) + + rem = ''' + out = s0.list.concat([s1]) + assert_series_equal(out, expected) + + out = s0.list.concat(s1) + assert_series_equal(out, expected) + ''' + + df = pl.DataFrame([s0, s1]) + result = df.select(pl.array(["a", "b"]).alias("z"))["z"] + print(result) + assert_series_equal(result, expected) From 950fc014c08a56078ce1c3314b4a0a67f93aebfc Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Tue, 1 Oct 2024 19:12:35 -0700 Subject: [PATCH 4/9] Get dtype working as crude string --- Cargo.lock | 4 +++ crates/polars-plan/Cargo.toml | 1 + crates/polars-plan/src/dsl/array.rs | 15 ++++++++-- .../src/dsl/function_expr/array.rs | 28 +++++++++-------- .../polars-plan/src/dsl/function_expr/mod.rs | 2 +- crates/polars-python/src/functions/lazy.rs | 4 +-- py-polars/polars/functions/as_datatype.py | 11 ++++--- .../unit/functions/as_datatype/test_array.py | 30 ++++++++++++------- 8 files changed, 62 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7d94e852f8d..6b26cf9115d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -173,6 +173,9 @@ name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +dependencies = [ + "serde", +] [[package]] name = "arrow-array" @@ -3366,6 +3369,7 @@ name = "polars-plan" version = "0.43.1" dependencies = [ "ahash", + "arrayvec", "bitflags", "bytemuck", "bytes", diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index d5676fecdf8d..475c994dabc6 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -43,6 +43,7 @@ regex = { workspace = true, optional = true } serde = { workspace = true, features = ["rc"], optional = true } serde_json = { workspace = true, optional = false } strum_macros = { workspace = true } +arrayvec = { version = "0.7.6" , features = ["serde"]} [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index c891ba6de7b0..b825b2e690aa 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -1,3 +1,4 @@ +use arrayvec::ArrayString; use polars_core::prelude::*; #[cfg(feature = "array_to_struct")] @@ -5,7 +6,7 @@ use polars_ops::chunked_array::array::{ arr_default_struct_name_gen, ArrToStructNameGenerator, ToStruct, }; -use crate::dsl::function_expr::ArrayFunction; +use crate::dsl::function_expr::{ArrayFunction, ArrayKwargs}; use crate::prelude::*; /// Specialized expressions for [`Series`] of [`DataType::Array`]. @@ -196,14 +197,22 @@ impl ArrayNameSpace { } } -pub fn array_from_expr, IE: Into + Clone>(s: E) -> PolarsResult { +pub fn array_from_expr, IE: Into + Clone>(s: E, dtype_str: &str) -> PolarsResult { let s: Vec<_> = s.as_ref().iter().map(|e| e.clone().into()).collect(); polars_ensure!(!s.is_empty(), ComputeError: "`array` needs one or more expressions"); + // let mut kwargs = ArrayKwargs::default(); + // const max_sz: usize = kwargs.dtype_expr.capacity(); + const MAX_SZ: usize = 256; // kwargs.dtype_expr.capacity(); + let mut trunc_str = dtype_str.to_string(); + trunc_str.truncate(MAX_SZ); + let fixed_string = ArrayString::<{MAX_SZ}>::from(&trunc_str).unwrap(); + let kwargs = ArrayKwargs{dtype_expr: fixed_string}; + Ok(Expr::Function { input: s, - function: FunctionExpr::ArrayExpr(ArrayFunction::Array), + function: FunctionExpr::ArrayExpr(ArrayFunction::Array(kwargs)), options: FunctionOptions { collect_groups: ApplyOptions::ElementWise, flags: FunctionFlags::default() | FunctionFlags::INPUT_WILDCARD_EXPANSION, diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 5b4a3a320a25..a3e64c1e4434 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -6,16 +6,19 @@ use polars_core::with_match_physical_numeric_polars_type; use std::collections::HashMap; use arrow::bitmap::MutableBitmap; use arrow::array::{PrimitiveArray, FixedSizeListArray}; +use arrayvec::ArrayString; - -#[derive(Clone, Deserialize)] -struct ArrayKwargs { - // I guess DataType is not one of the serializable types? - // In the source code I see this done vie Wrap - dtype_expr: String, +#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash, Default)] +#[derive(Serialize, Deserialize)] +pub struct ArrayKwargs { + // Not sure how to get a serializable DataType here + // For prototype, use fixed size string + pub dtype_expr: ArrayString::<256>, } + + #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum ArrayFunction { @@ -32,7 +35,7 @@ pub enum ArrayFunction { Any, #[cfg(feature = "array_any_all")] All, - Array, + Array(ArrayKwargs), Sort(SortOptions), Reverse, ArgMin, @@ -61,7 +64,7 @@ impl ArrayFunction { #[cfg(feature = "array_any_all")] Any | All => mapper.with_dtype(DataType::Boolean), // TODO: Figure out how to bind keyword argument - Array => array_output_type(mapper.args(), ArrayKwargs { dtype_expr: "".to_string() }), + Array(kwargs) => array_output_type(mapper.args(), kwargs), Sort(_) => mapper.with_same_dtype(), Reverse => mapper.with_same_dtype(), ArgMin | ArgMax => mapper.with_dtype(IDX_DTYPE), @@ -88,7 +91,7 @@ fn deserialize_dtype(dtype_expr: &str) -> PolarsResult> { }, } } -fn array_output_type(input_fields: &[Field], kwargs: ArrayKwargs) -> PolarsResult { +fn array_output_type(input_fields: &[Field], kwargs: &ArrayKwargs) -> PolarsResult { let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? .unwrap_or(input_fields[0].dtype.clone()); @@ -138,7 +141,7 @@ impl Display for ArrayFunction { Any => "any", #[cfg(feature = "array_any_all")] All => "all", - Array => "array", + Array(_) => "array", Sort(_) => "sort", Reverse => "reverse", ArgMin => "arg_min", @@ -172,7 +175,7 @@ impl From for SpecialEq> { Any => map!(any), #[cfg(feature = "array_any_all")] All => map!(all), - Array => map_as_slice!(array_new), + Array(kwargs) => map_as_slice!(array_new, kwargs), Sort(options) => map!(sort, options), Reverse => map!(reverse), ArgMin => map!(arg_min), @@ -189,8 +192,7 @@ impl From for SpecialEq> { } // Create a new array from a slice of series -fn array_new(inputs: &[Column]) -> PolarsResult { - let kwargs = ArrayKwargs { dtype_expr: "".to_string() }; +fn array_new(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { array_internal(inputs, kwargs) } fn array_internal(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 6347f6cee7b4..f1b65ee3747c 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -74,7 +74,7 @@ use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; #[cfg(feature = "dtype-array")] -pub(crate) use array::ArrayFunction; +pub(crate) use array::{ArrayFunction, ArrayKwargs}; #[cfg(feature = "cov")] pub(crate) use correlation::CorrelationMethod; #[cfg(feature = "fused")] diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index d369be986ff7..82b23fe4b01c 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -200,9 +200,9 @@ pub fn concat_list(s: Vec) -> PyResult { } #[pyfunction] -pub fn array(s: Vec) -> PyResult { +pub fn array(s: Vec, dtype_str: &str) -> PyResult { let s = s.into_iter().map(|e| e.inner).collect::>(); - let expr = dsl::array_from_expr(s).map_err(PyPolarsErr::from)?; + let expr = dsl::array_from_expr(s, dtype_str).map_err(PyPolarsErr::from)?; Ok(expr.into()) } diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py index fcc75cfdfd0f..29a8a4ad17fb 100644 --- a/py-polars/polars/functions/as_datatype.py +++ b/py-polars/polars/functions/as_datatype.py @@ -501,7 +501,8 @@ def concat_list(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> return wrap_expr(plr.concat_list(exprs)) -def array(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: + +def array(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, dtype: str = "") -> Expr: """ Horizontally concatenate columns into a single array column. @@ -510,15 +511,17 @@ def array(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr: Parameters ---------- exprs - Columns to concatenate into a single list column. Accepts expression input. + Columns to concatenate into a single array column. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. *more_exprs - Additional columns to concatenate into a single list column, specified as + Additional columns to concatenate into a single array column, specified as positional arguments. + dtype + A DataType expressed as a string """ exprs = parse_into_list_of_expressions(exprs, *more_exprs) - return wrap_expr(plr.array(exprs)) + return wrap_expr(plr.array(exprs, dtype)) @overload diff --git a/py-polars/tests/unit/functions/as_datatype/test_array.py b/py-polars/tests/unit/functions/as_datatype/test_array.py index 9ffc814d68fb..06bb7d2d91e9 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_array.py +++ b/py-polars/tests/unit/functions/as_datatype/test_array.py @@ -1,3 +1,5 @@ +from time import sleep + import pytest import polars as pl @@ -62,17 +64,25 @@ def build_array_i32_converted(): def test_array() -> None: s0 = pl.Series("a", [1.0, 2.0], dtype=pl.Float64) s1 = pl.Series("b", [3.0, 4.0], dtype=pl.Float64) - expected = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) + expected_f64 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) + expected_i32 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Int32, 2), strict=False) + df = pl.DataFrame([s0, s1]) + print("\n") - rem = ''' - out = s0.list.concat([s1]) - assert_series_equal(out, expected) + result = df.select(pl.array(["a", "b"]).alias("z"))["z"] + print("No Cast") + print(result) + print() + assert_series_equal(result, expected_f64) - out = s0.list.concat(s1) - assert_series_equal(out, expected) - ''' + result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z"))["z"] + print("Cast to Float64") + print(result) + print() + assert_series_equal(result, expected_f64) - df = pl.DataFrame([s0, s1]) - result = df.select(pl.array(["a", "b"]).alias("z"))["z"] + result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Int32"]}').alias("z"))["z"] + print("Cast to Int32") print(result) - assert_series_equal(result, expected) + print() + assert_series_equal(result, expected_i32) From a0bb53fe7a9f5d694b994c01d8ea27c0da385285 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 2 Oct 2024 12:51:18 -0700 Subject: [PATCH 5/9] Cleanup docs. Add tests. --- crates/polars-plan/src/dsl/array.rs | 2 +- .../src/dsl/function_expr/array.rs | 10 +---- .../unit/functions/as_datatype/test_array.py | 37 +++++++++++++++++++ 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index b825b2e690aa..e5ef33b85e0c 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -204,7 +204,7 @@ pub fn array_from_expr, IE: Into + Clone>(s: E, dtype_str: // let mut kwargs = ArrayKwargs::default(); // const max_sz: usize = kwargs.dtype_expr.capacity(); - const MAX_SZ: usize = 256; // kwargs.dtype_expr.capacity(); + const MAX_SZ: usize = 256; // hardcode for now, plan to replace this anyway let mut trunc_str = dtype_str.to_string(); trunc_str.truncate(MAX_SZ); let fixed_string = ArrayString::<{MAX_SZ}>::from(&trunc_str).unwrap(); diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index a3e64c1e4434..ddddb4f17469 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -92,6 +92,7 @@ fn deserialize_dtype(dtype_expr: &str) -> PolarsResult> { } } fn array_output_type(input_fields: &[Field], kwargs: &ArrayKwargs) -> PolarsResult { + // Expected target type is either the provided dtype or the type of the first column let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? .unwrap_or(input_fields[0].dtype.clone()); @@ -101,15 +102,6 @@ fn array_output_type(input_fields: &[Field], kwargs: &ArrayKwargs) -> PolarsResu } } - /* - // For now, allow casting to either the first, or provided dtype - for field in input_fields.iter().skip(1) { - if field.dtype != expected_dtype { - polars_bail!(ComputeError: "all input fields must have the same type") - } - } - */ - Ok(Field::new( PlSmallStr::from_static("array"), DataType::Array(Box::new(expected_dtype), input_fields.len()), diff --git a/py-polars/tests/unit/functions/as_datatype/test_array.py b/py-polars/tests/unit/functions/as_datatype/test_array.py index 06bb7d2d91e9..8dab19e6475a 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_array.py +++ b/py-polars/tests/unit/functions/as_datatype/test_array.py @@ -86,3 +86,40 @@ def test_array() -> None: print(result) print() assert_series_equal(result, expected_i32) + +def test_array_nulls() -> None: + s0 = pl.Series("a", [1.0, None], dtype=pl.Float64) + s1 = pl.Series("b", [None, 4.0], dtype=pl.Float64) + expected_f64 = pl.Series("z", [[1.0, None], [None, 4.0]], dtype=pl.Array(pl.Float64, 2)) + df = pl.DataFrame([s0, s1]) + print("\n") + + result = df.select(pl.array(["a", "b"]).alias("z"))["z"] + print("No Cast") + print(result) + print() + assert_series_equal(result, expected_f64) + +def test_array_string() -> None: + s0 = pl.Series("a", ['1', '2']) + s1 = pl.Series("b", ['3', '4']) + expected_f64 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) + df = pl.DataFrame([s0, s1]) + print("\n") + + result = df.select(pl.array(["a", "b"]).alias("z"))["z"] + print("No Cast") + print(result) + print() + assert_series_equal(result, expected_f64) + + result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z"))["z"] + print("Cast to Float64") + print(result) + print() + assert_series_equal(result, expected_f64) + + + + + From 6316d22fa84c4a30ee1eb00016b3f73a38eac3be Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 2 Oct 2024 13:43:33 -0700 Subject: [PATCH 6/9] Centralize target type logic. --- .../src/dsl/function_expr/array.rs | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index ddddb4f17469..5a9d5ba27a1b 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -91,10 +91,25 @@ fn deserialize_dtype(dtype_expr: &str) -> PolarsResult> { }, } } + +fn get_expected_dtype(inputs: &[DataType], kwargs: &ArrayKwargs) -> PolarsResult { + // Decide what dtype to use for the constructed array + // For now, the logic is to use the dtype in kwargs, if specified + // Otherwise, use the type of the first column. + // + // An alternate idea could be to call try_get_supertype for the types. + // Or logic like DataFrame::get_supertype_all + // The problem is, I think this cast may be too general and we may only want to support primitive types + // Also, we don't support String yet. + let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? + .unwrap_or(inputs[0].clone()); + Ok(expected_dtype) +} + fn array_output_type(input_fields: &[Field], kwargs: &ArrayKwargs) -> PolarsResult { // Expected target type is either the provided dtype or the type of the first column - let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? - .unwrap_or(input_fields[0].dtype.clone()); + let dtypes: Vec = input_fields.into_iter().map(|f| f.dtype().clone()).collect(); + let expected_dtype = get_expected_dtype(&dtypes, kwargs)?; for field in input_fields.iter() { if !field.dtype().is_numeric() { @@ -188,15 +203,15 @@ fn array_new(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { array_internal(inputs, kwargs) } fn array_internal(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { - let ref dtype = deserialize_dtype(&kwargs.dtype_expr)? - .unwrap_or(inputs[0].dtype().clone()); + let dtypes: Vec = inputs.into_iter().map(|f| f.dtype().clone()).collect(); + let expected_dtype = get_expected_dtype(&dtypes, &kwargs)?; // This conversion is yuck, there is probably a standard way to go from &[Column] to &[Series] let series: Vec = inputs.iter().map(|col| col.clone().take_materialized_series()).collect(); // Convert dtype to native numeric type and invoke array_numeric - let res_series = with_match_physical_numeric_polars_type!(dtype, |$T| { - array_numeric::<$T>(&series[..], dtype) + let res_series = with_match_physical_numeric_polars_type!(expected_dtype, |$T| { + array_numeric::<$T>(&series[..], &expected_dtype) })?; Ok(res_series.into_column()) From 4f3c35457b370624edbb81f2de0d8bdc09afa53f Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 2 Oct 2024 18:29:26 -0700 Subject: [PATCH 7/9] Remove dated TODO comment. --- crates/polars-plan/src/dsl/function_expr/array.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 5a9d5ba27a1b..c23c4c9b2aa6 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -63,7 +63,6 @@ impl ArrayFunction { Median => mapper.map_to_float_dtype(), #[cfg(feature = "array_any_all")] Any | All => mapper.with_dtype(DataType::Boolean), - // TODO: Figure out how to bind keyword argument Array(kwargs) => array_output_type(mapper.args(), kwargs), Sort(_) => mapper.with_same_dtype(), Reverse => mapper.with_same_dtype(), From 4ae43167b05701b7350b7388206ca9b96cb4d109 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 2 Oct 2024 18:55:35 -0700 Subject: [PATCH 8/9] Run pre-commit reformatting. Remove dead code in as_datatype.py Fix minor syntax in array.rs --- crates/polars-plan/src/dsl/array.rs | 12 ++- .../src/dsl/function_expr/array.rs | 94 +++++++++------- py-polars/polars/__init__.py | 2 +- py-polars/polars/functions/__init__.py | 2 +- py-polars/polars/functions/as_datatype.py | 5 +- py-polars/src/lib.rs | 3 +- .../unit/functions/as_datatype/test_array.py | 100 +++++------------- 7 files changed, 97 insertions(+), 121 deletions(-) diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index e5ef33b85e0c..bc450aceed2c 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -1,6 +1,5 @@ use arrayvec::ArrayString; use polars_core::prelude::*; - #[cfg(feature = "array_to_struct")] use polars_ops::chunked_array::array::{ arr_default_struct_name_gen, ArrToStructNameGenerator, ToStruct, @@ -197,7 +196,10 @@ impl ArrayNameSpace { } } -pub fn array_from_expr, IE: Into + Clone>(s: E, dtype_str: &str) -> PolarsResult { +pub fn array_from_expr, IE: Into + Clone>( + s: E, + dtype_str: &str, +) -> PolarsResult { let s: Vec<_> = s.as_ref().iter().map(|e| e.clone().into()).collect(); polars_ensure!(!s.is_empty(), ComputeError: "`array` needs one or more expressions"); @@ -207,8 +209,10 @@ pub fn array_from_expr, IE: Into + Clone>(s: E, dtype_str: const MAX_SZ: usize = 256; // hardcode for now, plan to replace this anyway let mut trunc_str = dtype_str.to_string(); trunc_str.truncate(MAX_SZ); - let fixed_string = ArrayString::<{MAX_SZ}>::from(&trunc_str).unwrap(); - let kwargs = ArrayKwargs{dtype_expr: fixed_string}; + let fixed_string = ArrayString::<{ MAX_SZ }>::from(&trunc_str).unwrap(); + let kwargs = ArrayKwargs { + dtype_expr: fixed_string, + }; Ok(Expr::Function { input: s, diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index c23c4c9b2aa6..88baa90cc80a 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -1,24 +1,21 @@ +use std::collections::HashMap; + +use arrayvec::ArrayString; +use arrow::array::{FixedSizeListArray, PrimitiveArray}; +use arrow::bitmap::MutableBitmap; +use polars_core::with_match_physical_numeric_polars_type; use polars_ops::chunked_array::array::*; use super::*; use crate::{map, map_as_slice}; -use polars_core::with_match_physical_numeric_polars_type; -use std::collections::HashMap; -use arrow::bitmap::MutableBitmap; -use arrow::array::{PrimitiveArray, FixedSizeListArray}; -use arrayvec::ArrayString; - -#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash, Default)] -#[derive(Serialize, Deserialize)] +#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash, Default, Serialize, Deserialize)] pub struct ArrayKwargs { // Not sure how to get a serializable DataType here // For prototype, use fixed size string - pub dtype_expr: ArrayString::<256>, + pub dtype_expr: ArrayString<256>, } - - #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum ArrayFunction { @@ -100,15 +97,14 @@ fn get_expected_dtype(inputs: &[DataType], kwargs: &ArrayKwargs) -> PolarsResult // Or logic like DataFrame::get_supertype_all // The problem is, I think this cast may be too general and we may only want to support primitive types // Also, we don't support String yet. - let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)? - .unwrap_or(inputs[0].clone()); + let expected_dtype = deserialize_dtype(&kwargs.dtype_expr)?.unwrap_or(inputs[0].clone()); Ok(expected_dtype) } fn array_output_type(input_fields: &[Field], kwargs: &ArrayKwargs) -> PolarsResult { // Expected target type is either the provided dtype or the type of the first column - let dtypes: Vec = input_fields.into_iter().map(|f| f.dtype().clone()).collect(); - let expected_dtype = get_expected_dtype(&dtypes, kwargs)?; + let dtypes: Vec = input_fields.iter().map(|f| f.dtype().clone()).collect(); + let expected_dtype = get_expected_dtype(&dtypes, kwargs)?; for field in input_fields.iter() { if !field.dtype().is_numeric() { @@ -202,11 +198,14 @@ fn array_new(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { array_internal(inputs, kwargs) } fn array_internal(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult { - let dtypes: Vec = inputs.into_iter().map(|f| f.dtype().clone()).collect(); - let expected_dtype = get_expected_dtype(&dtypes, &kwargs)?; + let dtypes: Vec = inputs.iter().map(|f| f.dtype().clone()).collect(); + let expected_dtype = get_expected_dtype(&dtypes, &kwargs)?; // This conversion is yuck, there is probably a standard way to go from &[Column] to &[Series] - let series: Vec = inputs.iter().map(|col| col.clone().take_materialized_series()).collect(); + let series: Vec = inputs + .iter() + .map(|col| col.clone().take_materialized_series()) + .collect(); // Convert dtype to native numeric type and invoke array_numeric let res_series = with_match_physical_numeric_polars_type!(expected_dtype, |$T| { @@ -217,8 +216,10 @@ fn array_internal(inputs: &[Column], kwargs: ArrayKwargs) -> PolarsResult(inputs: &[Series], dtype: &DataType) - -> PolarsResult { +fn array_numeric( + inputs: &[Series], + dtype: &DataType, +) -> PolarsResult { let rows = inputs[0].len(); let cols = inputs.len(); let capacity = cols * rows; @@ -275,8 +276,15 @@ fn array_numeric<'a, T: PolarsNumericType>(inputs: &[Series], dtype: &DataType) let values_array = PrimitiveArray::from_vec(values).with_validity(validity); let dtype = DataType::Array(Box::new(dtype.clone()), cols); let arrow_dtype = dtype.to_arrow(CompatLevel::newest()); - let array = FixedSizeListArray::try_new(arrow_dtype.clone(), Box::new(values_array), None)?; - Ok(unsafe {Series::_try_from_arrow_unchecked("Array".into(), vec![Box::new(array)], &arrow_dtype)?}) + let array = FixedSizeListArray::try_new( + arrow_dtype.clone(), + values_array.len(), + Box::new(values_array), + None, + )?; + Ok(unsafe { + Series::_try_from_arrow_unchecked("Array".into(), vec![Box::new(array)], &arrow_dtype)? + }) } pub(super) fn max(s: &Column) -> PolarsResult { @@ -396,12 +404,12 @@ pub(super) fn shift(s: &[Column]) -> PolarsResult { ca.array_shift(n.as_materialized_series()).map(Column::from) } - #[cfg(test)] mod test { use polars_core::datatypes::Field; use polars_core::frame::DataFrame; use polars_core::prelude::{Column, Series}; + use super::*; #[test] @@ -410,7 +418,7 @@ mod test { let f1 = Series::new("f1".into(), &[1.0, 2.0]); let f2 = Series::new("f2".into(), &[3.0, 4.0]); - let mut cols : Vec = Vec::new(); + let mut cols: Vec = Vec::new(); cols.push(Column::Series(f1)); cols.push(Column::Series(f2)); @@ -418,26 +426,30 @@ mod test { println!("input df\n{}\n", &array_df); let mut fields: Vec = Vec::new(); - for col in &cols{ + for col in &cols { let f: Field = (col.field().to_mut()).clone(); fields.push(f); } - let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string()}; - let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + let kwargs = crate::dsl::function_expr::array::ArrayKwargs { + dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string(), + }; + let expected_result = + crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); println!("expected result\n{:?}\n", &expected_result); - let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + let new_arr = + crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); println!("actual result\n{:?}", &new_arr); assert!(new_arr.is_ok()); assert_eq!(new_arr.unwrap().dtype(), expected_result.dtype()); } - fn i32_series() -> (Vec, Vec, DataFrame){ + fn i32_series() -> (Vec, Vec, DataFrame) { let f1 = Series::new("f1".into(), &[1, 2]); let f2 = Series::new("f2".into(), &[3, 4]); - let mut cols : Vec = Vec::new(); + let mut cols: Vec = Vec::new(); cols.push(Column::Series(f1)); cols.push(Column::Series(f2)); @@ -445,7 +457,7 @@ mod test { println!("input df\n{}\n", &array_df); let mut fields: Vec = Vec::new(); - for col in &cols{ + for col in &cols { let f: Field = (col.field().to_mut()).clone(); fields.push(f); } @@ -456,11 +468,15 @@ mod test { fn test_array_i32() { println!("\ntest_array_i32"); let (_cols, fields, array_df) = i32_series(); - let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Int32\"]}".to_string()}; - let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + let kwargs = crate::dsl::function_expr::array::ArrayKwargs { + dtype_expr: "{\"DtypeColumn\":[\"Int32\"]}".to_string(), + }; + let expected_result = + crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); println!("expected result\n{:?}\n", &expected_result); - let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + let new_arr = + crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); println!("actual result\n{:?}", &new_arr); assert!(new_arr.is_ok()); @@ -471,11 +487,15 @@ mod test { fn test_array_i32_converted() { println!("\ntest_array_i32_converted"); let (_cols, fields, array_df) = i32_series(); - let kwargs = crate::dsl::function_expr::array::ArrayKwargs {dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string()}; - let expected_result = crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); + let kwargs = crate::dsl::function_expr::array::ArrayKwargs { + dtype_expr: "{\"DtypeColumn\":[\"Float64\"]}".to_string(), + }; + let expected_result = + crate::dsl::function_expr::array::array_output_type(&fields, kwargs.clone()).unwrap(); println!("expected result\n{:?}\n", &expected_result); - let new_arr = crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); + let new_arr = + crate::dsl::function_expr::array::array_internal(array_df.get_columns(), kwargs); println!("actual result\n{:?}", &new_arr); assert!(new_arr.is_ok()); diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 33fd3ceeba3a..db2710b15663 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -80,6 +80,7 @@ arctan2d, arg_sort_by, arg_where, + array, business_day_count, coalesce, col, @@ -87,7 +88,6 @@ collect_all_async, concat, concat_list, - array, concat_str, corr, count, diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index ac42a605bc8d..24c9fcee2fe9 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -14,8 +14,8 @@ sum_horizontal, ) from polars.functions.as_datatype import ( - concat_list, array, + concat_list, concat_str, duration, format, diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py index daf3f743692a..ffda41f959b4 100644 --- a/py-polars/polars/functions/as_datatype.py +++ b/py-polars/polars/functions/as_datatype.py @@ -502,8 +502,9 @@ def concat_list(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> return wrap_expr(plr.concat_list(exprs)) - -def array(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, dtype: str = "") -> Expr: +def array( + exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, dtype: str = "" +) -> Expr: """ Horizontally concatenate columns into a single array column. diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 771492797533..896105a3bc96 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -170,8 +170,7 @@ fn polars(py: Python, m: &Bound) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::concat_list)) .unwrap(); - m.add_wrapped(wrap_pyfunction!(functions::array)) - .unwrap(); + m.add_wrapped(wrap_pyfunction!(functions::array)).unwrap(); m.add_wrapped(wrap_pyfunction!(functions::concat_str)) .unwrap(); m.add_wrapped(wrap_pyfunction!(functions::len)).unwrap(); diff --git a/py-polars/tests/unit/functions/as_datatype/test_array.py b/py-polars/tests/unit/functions/as_datatype/test_array.py index 8dab19e6475a..989d186f95bd 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_array.py +++ b/py-polars/tests/unit/functions/as_datatype/test_array.py @@ -1,71 +1,16 @@ -from time import sleep - -import pytest - import polars as pl -from polars.exceptions import ComputeError -from polars.testing import assert_frame_equal, assert_series_equal +from polars.testing import assert_series_equal -def build_array_f64(): - df = pl.DataFrame( - [ - pl.Series("f1", [1, 2]), - pl.Series("f2", [3, 4]), - ], - schema={ - "f1": pl.Float64, - "f2": pl.Float64, - }, - ) - print(df) - - # Now we call our plugin: - # sleep(30) - result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Float64))) - print(result) - -def build_array_i32(): - df = pl.DataFrame( - [ - pl.Series("f1", [1, 2]), - pl.Series("f2", [3, None]), - ], - schema={ - "f1": pl.Int32, - "f2": pl.Int32, - }, - ) - print(df) - - # Now we call our plugin: - # sleep(30) - result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Int32))) - print(result) - -def build_array_i32_converted(): - df = pl.DataFrame( - [ - pl.Series("f1", [1, None]), - pl.Series("f2", [None, 4]), - ], - schema={ - "f1": pl.Int32, - "f2": pl.Int32, - }, - ) - print(df) - - # Now we call our plugin: - # sleep(30) - result = print(df.with_columns(arr=mp.array(pl.all(), dtype=pl.Float64))) - print(result) - def test_array() -> None: s0 = pl.Series("a", [1.0, 2.0], dtype=pl.Float64) s1 = pl.Series("b", [3.0, 4.0], dtype=pl.Float64) - expected_f64 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) - expected_i32 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Int32, 2), strict=False) + expected_f64 = pl.Series( + "z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2) + ) + expected_i32 = pl.Series( + "z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Int32, 2), strict=False + ) df = pl.DataFrame([s0, s1]) print("\n") @@ -75,22 +20,29 @@ def test_array() -> None: print() assert_series_equal(result, expected_f64) - result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z"))["z"] + result = df.select( + pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z") + )["z"] print("Cast to Float64") print(result) print() assert_series_equal(result, expected_f64) - result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Int32"]}').alias("z"))["z"] + result = df.select( + pl.array(["a", "b"], dtype='{"DtypeColumn":["Int32"]}').alias("z") + )["z"] print("Cast to Int32") print(result) print() assert_series_equal(result, expected_i32) + def test_array_nulls() -> None: s0 = pl.Series("a", [1.0, None], dtype=pl.Float64) s1 = pl.Series("b", [None, 4.0], dtype=pl.Float64) - expected_f64 = pl.Series("z", [[1.0, None], [None, 4.0]], dtype=pl.Array(pl.Float64, 2)) + expected_f64 = pl.Series( + "z", [[1.0, None], [None, 4.0]], dtype=pl.Array(pl.Float64, 2) + ) df = pl.DataFrame([s0, s1]) print("\n") @@ -100,10 +52,13 @@ def test_array_nulls() -> None: print() assert_series_equal(result, expected_f64) + def test_array_string() -> None: - s0 = pl.Series("a", ['1', '2']) - s1 = pl.Series("b", ['3', '4']) - expected_f64 = pl.Series("z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2)) + s0 = pl.Series("a", ["1", "2"]) + s1 = pl.Series("b", ["3", "4"]) + expected_f64 = pl.Series( + "z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2) + ) df = pl.DataFrame([s0, s1]) print("\n") @@ -113,13 +68,10 @@ def test_array_string() -> None: print() assert_series_equal(result, expected_f64) - result = df.select(pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z"))["z"] + result = df.select( + pl.array(["a", "b"], dtype='{"DtypeColumn":["Float64"]}').alias("z") + )["z"] print("Cast to Float64") print(result) print() assert_series_equal(result, expected_f64) - - - - - From 18dc37576db576744d473943ecf3b13003de82d6 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Fri, 18 Oct 2024 14:53:47 -0700 Subject: [PATCH 9/9] Add test for empty array. Still need to figure out what to do for this case. --- .../unit/functions/as_datatype/test_array.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/py-polars/tests/unit/functions/as_datatype/test_array.py b/py-polars/tests/unit/functions/as_datatype/test_array.py index 989d186f95bd..45e96bf4e6c1 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_array.py +++ b/py-polars/tests/unit/functions/as_datatype/test_array.py @@ -37,6 +37,23 @@ def test_array() -> None: assert_series_equal(result, expected_i32) +def test_array_empty() -> None: + s0 = pl.Series("a", [1.0, 2.0], dtype=pl.Float64) + s1 = pl.Series("b", [3.0, 4.0], dtype=pl.Float64) + expected_f64 = pl.Series( + "z", [[1.0, 3.0], [2.0, 4.0]], dtype=pl.Array(pl.Float64, 2) + ) + df = pl.DataFrame([s0, s1]) + print("\n") + + result = df.select(pl.array([], dtype='{"DtypeColumn":["Float64"]}').alias("z"))["z"] + print("Empty") + print(result) + print() + # assert_series_equal(result, expected_f64) + + + def test_array_nulls() -> None: s0 = pl.Series("a", [1.0, None], dtype=pl.Float64) s1 = pl.Series("b", [None, 4.0], dtype=pl.Float64)