diff --git a/DESCRIPTION b/DESCRIPTION index 542490ba7..b49462c77 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,7 @@ URL: https://pola-rs.github.io/r-polars/, https://github.com/pola-rs/r-polars, https://rpolars.r-universe.dev/polars Suggests: - arrow, + arrow (>= 15.0.1), bench, bit64, callr, @@ -32,7 +32,7 @@ Suggests: jsonlite, knitr, lubridate, - nanoarrow, + nanoarrow (>= 0.4.0), nycflights13, patrick, pillar, diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index f75fad6ae..cd880f567 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -64,8 +64,6 @@ arrow_stream_to_df <- function(robj_str) .Call(wrap__arrow_stream_to_df, robj_st arrow_stream_to_series <- function(robj_str) .Call(wrap__arrow_stream_to_series, robj_str) -export_df_to_arrow_stream <- function(robj_df, robj_str) .Call(wrap__export_df_to_arrow_stream, robj_df, robj_str) - mem_address <- function(robj) .Call(wrap__mem_address, robj) clone_robj <- function(robj) .Call(wrap__clone_robj, robj) @@ -204,7 +202,7 @@ RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest, RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key) -RPolarsDataFrame$export_stream <- function(stream_ptr) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr)) +RPolarsDataFrame$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr, pl_flavor)) RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr) diff --git a/R/pkg-arrow.R b/R/pkg-arrow.R index 4d5013778..ea9910c64 100644 --- a/R/pkg-arrow.R +++ b/R/pkg-arrow.R @@ -1,5 +1,6 @@ #' Create a arrow Table from a Polars object #' +#' @inheritParams DataFrame_write_ipc #' @param x [A Polars DataFrame][DataFrame_class] #' @param ... Ignored #' @rdname S3_as_arrow_table @@ -9,8 +10,9 @@ #' pl_df = as_polars_df(mtcars) #' as_arrow_table(pl_df) # exported in zzz.R -as_arrow_table.RPolarsDataFrame = function(x, ...) { - reader = as_record_batch_reader.RPolarsDataFrame(x) +as_arrow_table.RPolarsDataFrame = function(x, ..., future = FALSE) { + reader = result(as_record_batch_reader.RPolarsDataFrame(x, future = future)) |> + unwrap("in as_arrow_table():") reader$read_table() } @@ -25,12 +27,17 @@ as_arrow_table.RPolarsDataFrame = function(x, ...) { #' pl_df = as_polars_df(mtcars) #' as_record_batch_reader(pl_df) # exported in zzz.R -as_record_batch_reader.RPolarsDataFrame = function(x, ...) { +as_record_batch_reader.RPolarsDataFrame = function(x, ..., future = FALSE) { + if (!is_scalar_bool(future)) { + Err_plain("`future` argument must be `TRUE` or `FALSE`") |> + unwrap("in as_record_batch_reader():") + } + # https://github.com/apache/arrow/issues/39793 allocate_arrow_array_stream = utils::getFromNamespace("allocate_arrow_array_stream", "arrow") external_pointer_addr_character = utils::getFromNamespace("external_pointer_addr_character", "arrow") stream = allocate_arrow_array_stream() - .pr$DataFrame$export_stream(x, external_pointer_addr_character(stream)) + .pr$DataFrame$export_stream(x, external_pointer_addr_character(stream), future) arrow::RecordBatchReader$import_from_c(stream) } diff --git a/R/pkg-nanoarrow.R b/R/pkg-nanoarrow.R index 947493277..f8b4bc540 100644 --- a/R/pkg-nanoarrow.R +++ b/R/pkg-nanoarrow.R @@ -1,6 +1,7 @@ #' Create a nanoarrow_array_stream from a Polars object #' #' @inheritParams as_arrow_table.RPolarsDataFrame +#' @inheritParams DataFrame_write_ipc #' @param schema must stay at default value NULL #' @rdname S3_as_nanoarrow_array_stream #' @examples @@ -10,18 +11,29 @@ #' nanoarrow_array_stream = as_nanoarrow_array_stream(pl_df) #' as.data.frame(nanoarrow_array_stream) # exported in zzz.R -as_nanoarrow_array_stream.RPolarsDataFrame = function(x, ..., schema = NULL) { +as_nanoarrow_array_stream.RPolarsDataFrame = function(x, ..., schema = NULL, future = FALSE) { + uw = \(res) unwrap("in as_nanoarrow_array_stream():") + # Don't support the schema argument yet - stopifnot(is.null(schema)) + if (!is.null(schema)) { + Err_plain("The `schema` argument is not supported yet") |> + uw() + } + + if (!is_scalar_bool(future)) { + Err_plain("`future` argument must be `TRUE` or `FALSE`") |> + uw() + } + stream = nanoarrow::nanoarrow_allocate_array_stream() - .pr$DataFrame$export_stream(x, nanoarrow::nanoarrow_pointer_addr_chr(stream)) + .pr$DataFrame$export_stream(x, nanoarrow::nanoarrow_pointer_addr_chr(stream), future) stream } #' Infer nanoarrow schema from a Polars object #' -#' @inheritParams as_arrow_table.RPolarsDataFrame +#' @inheritParams as_nanoarrow_array_stream.RPolarsDataFrame #' @rdname S3_infer_nanoarrow_schema #' @examples #' library(nanoarrow) @@ -29,6 +41,8 @@ as_nanoarrow_array_stream.RPolarsDataFrame = function(x, ..., schema = NULL) { #' #' infer_nanoarrow_schema(pl_df) # exported in zzz.R -infer_nanoarrow_schema.RPolarsDataFrame = function(x, ...) { - as_nanoarrow_array_stream.RPolarsDataFrame(x)$get_schema() +infer_nanoarrow_schema.RPolarsDataFrame = function(x, ..., future = FALSE) { + as_nanoarrow_array_stream.RPolarsDataFrame(x, future = future)$get_schema() |> + result() |> + unwrap("in infer_nanoarrow_schema():") } diff --git a/man/S3_as_arrow_table.Rd b/man/S3_as_arrow_table.Rd index 0eac43626..e1e2a38dd 100644 --- a/man/S3_as_arrow_table.Rd +++ b/man/S3_as_arrow_table.Rd @@ -4,12 +4,17 @@ \alias{as_arrow_table.RPolarsDataFrame} \title{Create a arrow Table from a Polars object} \usage{ -\method{as_arrow_table}{RPolarsDataFrame}(x, ...) +\method{as_arrow_table}{RPolarsDataFrame}(x, ..., future = FALSE) } \arguments{ \item{x}{\link[=DataFrame_class]{A Polars DataFrame}} \item{...}{Ignored} + +\item{future}{Setting this to \code{TRUE} will write Polars' internal data structures that +might not be available by other Arrow implementations. +This functionality is considered \strong{unstable}. +It may be changed at any point without it being considered a breaking change.} } \description{ Create a arrow Table from a Polars object diff --git a/man/S3_as_nanoarrow_array_stream.Rd b/man/S3_as_nanoarrow_array_stream.Rd index 7d7014efd..1e68603f0 100644 --- a/man/S3_as_nanoarrow_array_stream.Rd +++ b/man/S3_as_nanoarrow_array_stream.Rd @@ -4,7 +4,7 @@ \alias{as_nanoarrow_array_stream.RPolarsDataFrame} \title{Create a nanoarrow_array_stream from a Polars object} \usage{ -\method{as_nanoarrow_array_stream}{RPolarsDataFrame}(x, ..., schema = NULL) +\method{as_nanoarrow_array_stream}{RPolarsDataFrame}(x, ..., schema = NULL, future = FALSE) } \arguments{ \item{x}{\link[=DataFrame_class]{A Polars DataFrame}} @@ -12,6 +12,11 @@ \item{...}{Ignored} \item{schema}{must stay at default value NULL} + +\item{future}{Setting this to \code{TRUE} will write Polars' internal data structures that +might not be available by other Arrow implementations. +This functionality is considered \strong{unstable}. +It may be changed at any point without it being considered a breaking change.} } \description{ Create a nanoarrow_array_stream from a Polars object diff --git a/man/S3_as_record_batch_reader.Rd b/man/S3_as_record_batch_reader.Rd index c27514e02..af9de19eb 100644 --- a/man/S3_as_record_batch_reader.Rd +++ b/man/S3_as_record_batch_reader.Rd @@ -4,12 +4,17 @@ \alias{as_record_batch_reader.RPolarsDataFrame} \title{Create a arrow RecordBatchReader from a Polars object} \usage{ -\method{as_record_batch_reader}{RPolarsDataFrame}(x, ...) +\method{as_record_batch_reader}{RPolarsDataFrame}(x, ..., future = FALSE) } \arguments{ \item{x}{\link[=DataFrame_class]{A Polars DataFrame}} \item{...}{Ignored} + +\item{future}{Setting this to \code{TRUE} will write Polars' internal data structures that +might not be available by other Arrow implementations. +This functionality is considered \strong{unstable}. +It may be changed at any point without it being considered a breaking change.} } \description{ Create a arrow RecordBatchReader from a Polars object diff --git a/man/S3_infer_nanoarrow_schema.Rd b/man/S3_infer_nanoarrow_schema.Rd index afb52c98f..cb9eeab0f 100644 --- a/man/S3_infer_nanoarrow_schema.Rd +++ b/man/S3_infer_nanoarrow_schema.Rd @@ -4,12 +4,17 @@ \alias{infer_nanoarrow_schema.RPolarsDataFrame} \title{Infer nanoarrow schema from a Polars object} \usage{ -\method{infer_nanoarrow_schema}{RPolarsDataFrame}(x, ...) +\method{infer_nanoarrow_schema}{RPolarsDataFrame}(x, ..., future = FALSE) } \arguments{ \item{x}{\link[=DataFrame_class]{A Polars DataFrame}} \item{...}{Ignored} + +\item{future}{Setting this to \code{TRUE} will write Polars' internal data structures that +might not be available by other Arrow implementations. +This functionality is considered \strong{unstable}. +It may be changed at any point without it being considered a breaking change.} } \description{ Infer nanoarrow schema from a Polars object diff --git a/src/rust/src/arrow_interop/to_rust.rs b/src/rust/src/arrow_interop/to_rust.rs index ecbc73657..a38126256 100644 --- a/src/rust/src/arrow_interop/to_rust.rs +++ b/src/rust/src/arrow_interop/to_rust.rs @@ -154,14 +154,3 @@ fn consume_arrow_stream_to_series(boxed_stream: Box) -> R } Ok(s) } - -pub unsafe fn export_df_as_stream(df: pl::DataFrame, robj_str_ref: &Robj) -> RResult<()> { - let stream_ptr = - crate::utils::robj_str_ptr_to_usize(robj_str_ref)? as *mut ffi::ArrowArrayStream; - let schema = df.schema().to_arrow(true); - let data_type = pl::ArrowDataType::Struct(schema.fields); - let field = pl::ArrowField::new("", data_type, false); - let iter_boxed = Box::new(crate::rdataframe::OwnedDataFrameIterator::new(df)); - unsafe { *stream_ptr = ffi::export_iterator(iter_boxed, field) }; - Ok(()) -} diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 3290f748d..9b2884119 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -33,11 +33,12 @@ pub struct OwnedDataFrameIterator { data_type: arrow::datatypes::ArrowDataType, idx: usize, n_chunks: usize, + pl_flavor: bool, } impl OwnedDataFrameIterator { - pub fn new(df: polars::frame::DataFrame) -> Self { - let schema = df.schema().to_arrow(false); + pub fn new(df: polars::frame::DataFrame, pl_flavor: bool) -> Self { + let schema = df.schema().to_arrow(pl_flavor); let data_type = ArrowDataType::Struct(schema.fields); let vs = df.get_columns().to_vec(); Self { @@ -45,6 +46,7 @@ impl OwnedDataFrameIterator { data_type, idx: 0, n_chunks: df.n_chunks(), + pl_flavor, } } } @@ -60,7 +62,7 @@ impl Iterator for OwnedDataFrameIterator { let batch_cols = self .columns .iter() - .map(|s| s.to_arrow(self.idx, false)) + .map(|s| s.to_arrow(self.idx, self.pl_flavor)) .collect(); self.idx += 1; @@ -346,12 +348,12 @@ impl RPolarsDataFrame { Ok(List::from_values(vec)) } - pub fn export_stream(&self, stream_ptr: &str) { - let schema = self.0.schema().to_arrow(false); + pub fn export_stream(&self, stream_ptr: &str, pl_flavor: bool) { + let schema = self.0.schema().to_arrow(pl_flavor); let data_type = ArrowDataType::Struct(schema.fields); let field = ArrowField::new("", data_type, false); - let iter_boxed = Box::new(OwnedDataFrameIterator::new(self.0.clone())); + let iter_boxed = Box::new(OwnedDataFrameIterator::new(self.0.clone(), pl_flavor)); let mut stream = arrow::ffi::export_iterator(iter_boxed, field); let stream_out_ptr_addr: usize = stream_ptr.parse().unwrap(); let stream_out_ptr = stream_out_ptr_addr as *mut arrow::ffi::ArrowArrayStream; diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index b31edb78b..377f666cb 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -218,17 +218,6 @@ fn arrow_stream_to_series(robj_str: Robj) -> RResult { Ok(RPolarsSeries(s).into_robj()) } -#[extendr] -unsafe fn export_df_to_arrow_stream(robj_df: Robj, robj_str: Robj) -> RResult { - let res: ExternalPtr = robj_df.try_into()?; - let pl_df = RPolarsDataFrame(res.0.clone()).0; - //safety robj_str must be ptr to a arrow2 stream ready to export into - unsafe { - crate::arrow_interop::to_rust::export_df_as_stream(pl_df, &robj_str)?; - } - Ok(robj_str) -} - #[extendr] pub fn dtype_str_repr(dtype: Robj) -> RResult { let dtype = robj_to!(RPolarsDataType, dtype)?.0; @@ -474,7 +463,6 @@ extendr_module! { fn new_arrow_stream; fn arrow_stream_to_df; fn arrow_stream_to_series; - fn export_df_to_arrow_stream; //robj meta fn mem_address; diff --git a/tests/testthat/test-pkg-arrow.R b/tests/testthat/test-pkg-arrow.R index 336ddc4df..c8ce5965b 100644 --- a/tests/testthat/test-pkg-arrow.R +++ b/tests/testthat/test-pkg-arrow.R @@ -6,8 +6,7 @@ test_that("as_record_batch_reader() works for DataFrame", { expect_s3_class(reader, "RecordBatchReader") expect_identical( - # two as.data.frame()s because arrow sometimes returns a tibble here - as.data.frame(as.data.frame(reader)), + as.data.frame(reader), data.frame(a = 1L, b = "two") ) }) @@ -20,8 +19,12 @@ test_that("as_arrow_table() works for DataFrame", { expect_s3_class(table, "Table") expect_identical( - # two as.data.frame()s because arrow sometimes returns a tibble here - as.data.frame(as.data.frame(table)), + as.data.frame(table), data.frame(a = 1L, b = "two") ) + + expect_identical( + arrow::as_arrow_table(df, future = TRUE)$b$type$ToString(), + "string_view" + ) }) diff --git a/tests/testthat/test-pkg-nanoarrow.R b/tests/testthat/test-pkg-nanoarrow.R index 93b5d2dbf..2216e60b6 100644 --- a/tests/testthat/test-pkg-nanoarrow.R +++ b/tests/testthat/test-pkg-nanoarrow.R @@ -8,6 +8,14 @@ test_that("as_nanoarrow_array_stream() works for DataFrame", { as.data.frame(stream), data.frame(a = 1L, b = "two") ) + + # nanoarrow does not support the string view type yet + # https://github.com/apache/arrow-nanoarrow/pull/367 + expect_grepl_error( + nanoarrow::as_nanoarrow_array_stream(df, future = TRUE) |> + as.data.frame(), + "Unknown format: 'vu'" + ) }) test_that("infer_nanoarrow_schema() works for DataFrame", {