From ec11a68246744e2995feafe9edc87db71d97e274 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 28 Oct 2022 16:55:00 +0200 Subject: [PATCH 1/3] chore(rust): update arrow --- Cargo.toml | 6 +-- .../src/kernels/rolling/nulls/min_max.rs | 4 +- polars/polars-arrow/src/trusted_len/mod.rs | 11 ++++- .../src/chunked_array/iterator/mod.rs | 42 +++++++++++++------ .../src/chunked_array/ops/apply.rs | 2 +- polars/polars-core/src/series/iterator.rs | 31 ++++++++++---- .../nan_propagating_aggregate.rs | 1 + py-polars/Cargo.lock | 3 +- 8 files changed, 69 insertions(+), 31 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 93e20495b0c5..cf0f1bb32145 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,10 +32,10 @@ bitflags = "1.3" [workspace.dependencies.arrow] package = "arrow2" # git = "https://github.com/jorgecarleitao/arrow2" -# git = "https://github.com/ritchie46/arrow2" -# rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd" +git = "https://github.com/ritchie46/arrow2" +# rev = "e106cff24dc0c8942603712d7332a97871dce44e" # path = "../../../arrow2" -# branch = "comparison_and_validity" +branch = "2022_10_28" version = "0.14.1" default-features = false features = [ diff --git a/polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs b/polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs index 28da46ba7224..4d63b1724813 100644 --- a/polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs +++ b/polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs @@ -1,4 +1,4 @@ -use arrow::bitmap::utils::{count_zeros, zip_validity}; +use arrow::bitmap::utils::{count_zeros, ZipValidityIter}; use nulls; use nulls::{rolling_apply_agg_window, RollingAggWindowNulls}; @@ -9,7 +9,7 @@ pub fn is_reverse_sorted_max_nulls( validity: &Bitmap, ) -> bool { let mut current_max = None; - for opt_v in zip_validity(values.iter(), Some(validity.iter())) { + for opt_v in ZipValidityIter::new(values.iter(), validity.iter()) { match (current_max, opt_v) { // do nothing (None, None) => {} diff --git a/polars/polars-arrow/src/trusted_len/mod.rs b/polars/polars-arrow/src/trusted_len/mod.rs index c3c73cfca8cf..58daed978f8c 100644 --- a/polars/polars-arrow/src/trusted_len/mod.rs +++ b/polars/polars-arrow/src/trusted_len/mod.rs @@ -5,7 +5,7 @@ mod rev; use std::iter::Scan; use std::slice::Iter; -use arrow::bitmap::utils::{BitmapIter, ZipValidity}; +use arrow::bitmap::utils::{BitmapIter, ZipValidity, ZipValidityIter}; pub use push_unchecked::*; pub use rev::FromIteratorReversed; @@ -66,7 +66,14 @@ unsafe impl, J> TrustedLen for TrustMyLength {} unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} unsafe impl TrustedLen for arrow::array::Utf8ValuesIter<'_, i64> {} unsafe impl TrustedLen for arrow::array::BinaryValueIter<'_, i64> {} -unsafe impl> TrustedLen for ZipValidity<'_, T, I> {} +unsafe impl, V: TrustedLen + Iterator> TrustedLen + for ZipValidityIter +{ +} +unsafe impl, V: TrustedLen + Iterator> TrustedLen + for ZipValidity +{ +} unsafe impl TrustedLen for BitmapIter<'_> {} unsafe impl TrustedLen for std::iter::StepBy {} diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/polars/polars-core/src/chunked_array/iterator/mod.rs index 8f7fa9d34ba3..9233551995f5 100644 --- a/polars/polars-core/src/chunked_array/iterator/mod.rs +++ b/polars/polars-core/src/chunked_array/iterator/mod.rs @@ -306,18 +306,36 @@ impl<'a> IntoIterator for &'a ListChunked { fn into_iter(self) -> Self::IntoIter { let dtype = self.inner_dtype(); - // we know that we only iterate over length == self.len() - unsafe { - Box::new( - self.downcast_iter() - .flat_map(|arr| arr.iter()) - .trust_my_length(self.len()) - .map(move |arr| { - arr.map(|arr| { - Series::from_chunks_and_dtype_unchecked("", vec![arr], &dtype) - }) - }), - ) + if self.null_count() == 0 { + // we know that we only iterate over length == self.len() + unsafe { + Box::new( + self.downcast_iter() + .flat_map(|arr| arr.iter().unwrap_required()) + .trust_my_length(self.len()) + .map(move |arr| { + Some(Series::from_chunks_and_dtype_unchecked( + "", + vec![arr], + &dtype, + )) + }), + ) + } + } else { + // we know that we only iterate over length == self.len() + unsafe { + Box::new( + self.downcast_iter() + .flat_map(|arr| arr.iter().unwrap_optional()) + .trust_my_length(self.len()) + .map(move |arr| { + arr.map(|arr| { + Series::from_chunks_and_dtype_unchecked("", vec![arr], &dtype) + }) + }), + ) + } } } } diff --git a/polars/polars-core/src/chunked_array/ops/apply.rs b/polars/polars-core/src/chunked_array/ops/apply.rs index 271445fc497c..c4e06691f899 100644 --- a/polars/polars-core/src/chunked_array/ops/apply.rs +++ b/polars/polars-core/src/chunked_array/ops/apply.rs @@ -652,7 +652,7 @@ impl<'a> ChunkApply<'a, Series, Series> for ListChunked { }); f(x) }); - let len = values.len(); + let len = array.len(); // we know the iterators len unsafe { diff --git a/polars/polars-core/src/series/iterator.rs b/polars/polars-core/src/series/iterator.rs index 9012ad72262e..5dce58b56d0c 100644 --- a/polars/polars-core/src/series/iterator.rs +++ b/polars/polars-core/src/series/iterator.rs @@ -105,21 +105,34 @@ impl Series { }) } } else { - // TODO! null_count paths, but first exactsize iters must be implmeneted upstream match dtype { DataType::Utf8 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - Box::new(arr.iter().map(|value| match value { - Some(value) => AnyValue::Utf8(value), - None => AnyValue::Null, - })) as Box> + '_> + if arr.null_count() == 0 { + Box::new(arr.values_iter().map(AnyValue::Utf8)) + as Box> + '_> + } else { + let zipvalid = arr.iter(); + Box::new(zipvalid.unwrap_optional().map(|v| match v { + Some(value) => AnyValue::Utf8(value), + None => AnyValue::Null, + })) + as Box> + '_> + } } DataType::Boolean => { let arr = arr.as_any().downcast_ref::().unwrap(); - Box::new(arr.iter().map(|value| match value { - Some(value) => AnyValue::Boolean(value), - None => AnyValue::Null, - })) as Box> + '_> + if arr.null_count() == 0 { + Box::new(arr.values_iter().map(AnyValue::Boolean)) + as Box> + '_> + } else { + let zipvalid = arr.iter(); + Box::new(zipvalid.unwrap_optional().map(|v| match v { + Some(value) => AnyValue::Boolean(value), + None => AnyValue::Null, + })) + as Box> + '_> + } } _ => Box::new(self.iter()), } diff --git a/polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs b/polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs index 7caa84091937..8a37ba3dd739 100644 --- a/polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs +++ b/polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs @@ -43,6 +43,7 @@ where arr.values().iter().copied().fold_first_(min_or_max_fn) } else { arr.iter() + .unwrap_optional() .map(|opt| opt.copied()) .fold_first_(|a, b| match (a, b) { (Some(a), Some(b)) => Some(min_or_max_fn(a, b)), diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 80cdb89d9c69..dc697603a20f 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -95,8 +95,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6f62e41078c967a4c063fcbdfd3801a2a9632276402c045311c4d73d0845f3" +source = "git+https://github.com/ritchie46/arrow2?branch=2022_10_28#86c638612e9017f07784f5480e6ebe0b97eb6528" dependencies = [ "ahash 0.7.6", "arrow-format", From b1c880c6231079ae37bccbd0e74e1302e53b66b7 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 29 Oct 2022 12:04:19 +0200 Subject: [PATCH 2/3] fix python --- polars/polars-arrow/src/kernels/list.rs | 4 +--- polars/polars-core/src/chunked_array/iterator/mod.rs | 2 +- py-polars/Cargo.lock | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/polars/polars-arrow/src/kernels/list.rs b/polars/polars-arrow/src/kernels/list.rs index 433594180e19..9cb5b0ac486f 100644 --- a/polars/polars-arrow/src/kernels/list.rs +++ b/polars/polars-arrow/src/kernels/list.rs @@ -140,9 +140,7 @@ mod test { let out = sublist_get_indexes(&arr, 1); assert_eq!( - out.into_iter() - .map(|opt_v| opt_v.cloned()) - .collect::>(), + out.into_iter().collect::>(), &[None, None, None, Some(4), Some(7), Some(10)] ); } diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/polars/polars-core/src/chunked_array/iterator/mod.rs index 9233551995f5..e6bc18397fae 100644 --- a/polars/polars-core/src/chunked_array/iterator/mod.rs +++ b/polars/polars-core/src/chunked_array/iterator/mod.rs @@ -327,7 +327,7 @@ impl<'a> IntoIterator for &'a ListChunked { unsafe { Box::new( self.downcast_iter() - .flat_map(|arr| arr.iter().unwrap_optional()) + .flat_map(|arr| arr.iter()) .trust_my_length(self.len()) .map(move |arr| { arr.map(|arr| { diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index dc697603a20f..706927022edc 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.14.2" -source = "git+https://github.com/ritchie46/arrow2?branch=2022_10_28#86c638612e9017f07784f5480e6ebe0b97eb6528" +source = "git+https://github.com/ritchie46/arrow2?branch=2022_10_28#1c366cd403bb1eb93b9f86e2a77ad589aec95b82" dependencies = [ "ahash 0.7.6", "arrow-format", From efc7ad8c48be42ec2a633f776d17787e051d2312 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 6 Nov 2022 09:44:06 +0100 Subject: [PATCH 3/3] include parquet fix --- Cargo.toml | 2 +- py-polars/Cargo.lock | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cf0f1bb32145..0f16428fcc14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ package = "arrow2" git = "https://github.com/ritchie46/arrow2" # rev = "e106cff24dc0c8942603712d7332a97871dce44e" # path = "../../../arrow2" -branch = "2022_10_28" +branch = "2022_11_06" version = "0.14.1" default-features = false features = [ diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 706927022edc..0acca510f88f 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.14.2" -source = "git+https://github.com/ritchie46/arrow2?branch=2022_10_28#1c366cd403bb1eb93b9f86e2a77ad589aec95b82" +source = "git+https://github.com/ritchie46/arrow2?branch=2022_11_06#161901432f645842838d63360bb0bae81789c17c" dependencies = [ "ahash 0.7.6", "arrow-format", @@ -120,6 +120,7 @@ dependencies = [ "parquet2", "regex", "regex-syntax", + "rustc_version", "simdutf8", "streaming-iterator", "strength_reduce", @@ -239,18 +240,18 @@ checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" [[package]] name = "bytemuck" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aec14f5d4e6e3f927cd0c81f72e5710d95ee9019fbeb4b3021193867491bfd8" +checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9e1f5fa78f69496407a27ae9ed989e3c3b072310286f5ef385525e4cbc24a9" +checksum = "5fe233b960f12f8007e3db2d136e3cb1c291bfd7396e384ee76025fc1a3932b4" dependencies = [ "proc-macro2", "quote", @@ -1627,9 +1628,9 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" @@ -1829,9 +1830,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" dependencies = [ "aho-corasick", "memchr", @@ -1840,9 +1841,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.27" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "rle-decode-fast" @@ -1850,6 +1851,15 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustversion" version = "1.0.9" @@ -2077,9 +2087,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c02424087780c9b71cc96799eaeddff35af2bc513278cda5c99fc1f5d026d3c1" +checksum = "9410d0f6853b1d94f0e519fb95df60f29d2c1eff2d921ffdf01a4c8a3b54f12d" [[package]] name = "termcolor"