diff --git a/R/DenseArray.R b/R/DenseArray.R index 5b294cbe3d..5caf17f065 100644 --- a/R/DenseArray.R +++ b/R/DenseArray.R @@ -1,6 +1,6 @@ #' @exportClass tiledb_dense setClass("tiledb_dense", - slots = list(ctx = "tiledb_ctx", uri = "character", ptr = "externalptr")) + slots = list(ctx = "tiledb_ctx", uri = "character", as.data.frame = "logical", ptr = "externalptr")) #' Constructs a tiledb_dense object backed by a persisted tiledb array uri #' @@ -9,7 +9,7 @@ setClass("tiledb_dense", #' @param query_type optionally loads the array in "READ" or "WRITE" only modes. #' @return tiledb_dense array object #' @export -tiledb_dense <- function(ctx, uri, query_type = c("READ", "WRITE")) { +tiledb_dense <- function(ctx, uri, query_type = c("READ", "WRITE"), as.data.frame=TRUE) { query_type = match.arg(query_type) if (missing(ctx) || !is(ctx, "tiledb_ctx")) { stop("argument ctx must be a tiledb_ctx") @@ -24,7 +24,7 @@ tiledb_dense <- function(ctx, uri, query_type = c("READ", "WRITE")) { stop("array URI must be a dense array") } array_xptr <- libtiledb_array_close(array_xptr) - new("tiledb_dense", ctx = ctx, uri = uri, ptr = array_xptr) + new("tiledb_dense", ctx = ctx, uri = uri, as.data.frame = as.data.frame, ptr = array_xptr) } setMethod("show", "tiledb_dense", @@ -132,7 +132,7 @@ subarray_dim <- function(sub) { return(sub_dim) } -attribute_buffers <- function(sch, dom, sub, filter_attributes=list()) { +attribute_buffers <- function(array, sch, dom, sub, filter_attributes=list()) { stopifnot(is(sch, "tiledb_array_schema")) stopifnot(is(dom, "tiledb_domain")) sub_dim <- subarray_dim(sub) @@ -140,17 +140,34 @@ attribute_buffers <- function(sch, dom, sub, filter_attributes=list()) { is_scalar <- all(sub_dim == 1L) attributes <- list() + + # first alloc coordinate buffer if we are returning a data.frame + if(array@as.data.frame) { + ncells_coords <- libtiledb_array_max_buffer_elements(array@ptr, sub, libtiledb_coords()) + if (is.integral(dom)) { + attributes[["coords"]] <- 
integer(length = ncells_coords) + } else { + attributes[["coords"]] <- numeric(length = ncells_coords) + } + } + attrs <- tiledb::attrs(sch) if (length(filter_attributes) > 0) { attrs <- Filter(function(a) is.element(name(a), filter_attributes), attrs) } for(attr in attrs) { + aname <- tiledb::name(attr) type <- tiledb_datatype_R_type(tiledb::datatype(attr)) + # If we are going to get it as a dataframe we need to use max buffer elements to get proper buffer size + if(array@as.data.frame) { + ncells <- libtiledb_array_max_buffer_elements(array@ptr, sub, aname) + } buff <- vector(mode = type, length = ncells) - if (!is_scalar) { + # If its not scalar and we are not getting it as a data.frame set the dimension attribute + if (!is_scalar && !array@as.data.frame) { attr(buff, "dim") <- sub_dim } - attributes[[tiledb::name(attr)]] <- buff + attributes[[aname]] <- buff } return(attributes) } @@ -169,7 +186,7 @@ setMethod("[", "tiledb_dense", out <- tryCatch( { subarray <- domain_subarray(dom, index = index) - buffers <- attribute_buffers(schema, dom, subarray) + buffers <- attribute_buffers(x, schema, dom, subarray) qry <- libtiledb_query(ctx@ptr, x@ptr, "READ") qry <- libtiledb_query_set_layout(qry, "COL_MAJOR") if (is.integral(dom)) { @@ -179,7 +196,12 @@ setMethod("[", "tiledb_dense", } attr_names <- names(buffers) for (idx in seq_along(buffers)) { - qry <- libtiledb_query_set_buffer(qry, attr_names[[idx]], buffers[[idx]]) + aname <- attr_names[[idx]] + if (aname == "coords") { + qry <- libtiledb_query_set_buffer(qry, libtiledb_coords(), buffers[[idx]]) + } else { + qry <- libtiledb_query_set_buffer(qry, aname, buffers[[idx]]) + } } qry <- libtiledb_query_submit(qry) if (libtiledb_query_status(qry) != "COMPLETE") { @@ -191,11 +213,31 @@ setMethod("[", "tiledb_dense", buffers[[i]] <- drop(buffers[[i]]) } } - # if there is only one buffer, don't return a list of attribute buffers - if (length(buffers) == 1L) { - return(buffers[[1L]]) + + # get the actual number of 
results, instead of realloc + # just modify the vector length so there is no additional copy + for (idx in seq_along(attr_names)) { + old_buffer <- buffers[[idx]] + aname <- attr_names[[idx]] + if (aname == "coords") { + ncells <- libtiledb_query_result_buffer_elements(qry, libtiledb_coords()) + } else { + ncells <- libtiledb_query_result_buffer_elements(qry, aname) + } + if (ncells < length(old_buffer)) { + buffers[[idx]] <- old_buffer[1:ncells] + } + } + + if (x@as.data.frame) { + return(as_data_frame(dom, buffers)) + } else { + # if there is only one buffer, don't return a list of attribute buffers + if (length(buffers) == 1L) { + return(buffers[[1L]]) + } + return(buffers) } - return(buffers) }, finally = { libtiledb_array_close(x@ptr) diff --git a/R/SparseArray.R b/R/SparseArray.R index ea2e698953..89d0e55658 100644 --- a/R/SparseArray.R +++ b/R/SparseArray.R @@ -1,6 +1,6 @@ #' @exportClass "tiledb_sparse" setClass("tiledb_sparse", - slots = list(ctx = "tiledb_ctx", uri = "character", ptr = "externalptr")) + slots = list(ctx = "tiledb_ctx", uri = "character", as.data.frame = "logical", ptr = "externalptr")) #' Constructs a tiledb_sparse object backed by a persisted tiledb array uri #' @@ -11,7 +11,7 @@ setClass("tiledb_sparse", #' @param query_type optionally loads the array in "READ" or "WRITE" only modes. 
#' @return tiledb_sparse array object #' @export -tiledb_sparse <- function(ctx, uri, query_type = c("READ", "WRITE")) { +tiledb_sparse <- function(ctx, uri, query_type = c("READ", "WRITE"), as.data.frame=TRUE) { query_type = match.arg(query_type) if (missing(ctx) || !is(ctx, "tiledb_ctx")) { stop("argument ctx must be a tiledb_ctx") @@ -26,7 +26,7 @@ tiledb_sparse <- function(ctx, uri, query_type = c("READ", "WRITE")) { stop("array URI must be a sparse array") } array_xptr <- libtiledb_array_close(array_xptr) - new("tiledb_sparse", ctx = ctx, uri = uri, ptr = array_xptr) + new("tiledb_sparse", ctx = ctx, uri = uri, as.data.frame = as.data.frame, ptr = array_xptr) } @@ -64,6 +64,26 @@ sparse_attribute_buffers <- function(array, sch, dom, sub, filter_attributes=lis return(attributes) } +#' Construct a data.frame from query results +as_data_frame <- function(dom, data) { + if (!is(dom, "tiledb_domain")) { + stop("as_data_frame must be called with a tiledb_domain object") + } + # data is a named list of result buffers; an optional "coords" entry holds the coordinates interleaved per cell (one value per dimension) and is split into one column per dimension, appended after the attribute columns + coords <- data[["coords"]] + if (!is.null(coords)) { + ndim <- tiledb_ndim(dom) + dims <- dimensions(dom) + data[["coords"]] <- NULL + for (i in seq_len(ndim)) { + # length.out keeps an empty result empty; seq(i, 0, ndim) would stop with "wrong sign in 'by' argument" + col <- coords[seq.int(i, by = ndim, length.out = length(coords) %/% ndim)] + data <- c(data, structure(list(col), names = name(dims[[i]]))) + } + } + return(as.data.frame(data)) +} + setMethod("[", "tiledb_sparse", function(x, i, j, ..., drop = FALSE) { index <- nd_index_from_syscall(sys.call(), parent.frame()) @@ -93,7 +113,7 @@ setMethod("[", "tiledb_sparse", if (aname == "coords") { qry <- libtiledb_query_set_buffer(qry, libtiledb_coords(), buffers[[idx]]) } else { - qry <- libtiledb_query_set_buffer(qry, aname, buffers[[idx]]) + qry <- libtiledb_query_set_buffer(qry, aname, buffers[[idx]]) } } qry <- libtiledb_query_submit(qry) @@ -114,11 +134,15 @@ setMethod("[", "tiledb_sparse", buffers[[idx]] <- old_buffer[1:ncells] } } + if (x@as.data.frame) { + return(as_data_frame(dom, buffers)) +
} else { # if there is only one buffer, don't return a list of attribute buffers if (length(buffers) == 1L) { return(buffers[[1L]]) } - return(buffers) + return(buffers) + } }, finally = { libtiledb_array_close(x@ptr) @@ -265,7 +289,7 @@ tiledb_subarray <- function(A, subarray_vector, attrs=c()) { if (is.sparse(A)) { buffers <- sparse_attribute_buffers(A, schema, dom, subarray_vector, attrs) } else { - buffers <- attribute_buffers(schema, dom, subarray_vector, attrs) + buffers <- attribute_buffers(A, schema, dom, subarray_vector, attrs) } qry <- libtiledb_query(ctx@ptr, A@ptr, "READ") qry <- libtiledb_query_set_layout(qry, "COL_MAJOR") @@ -297,11 +321,15 @@ tiledb_subarray <- function(A, subarray_vector, attrs=c()) { buffers[[idx]] <- old_buffer[1:ncells] } } - # if there is only one buffer, don't return a list of attribute buffers - if (length(buffers) == 1L) { - return(buffers[[1L]]) + if (A@as.data.frame) { + return(as_data_frame(dom, buffers)) + } else { + # if there is only one buffer, don't return a list of attribute buffers + if (length(buffers) == 1L) { + return(buffers[[1L]]) + } + return(buffers) } - return(buffers) }, finally = { libtiledb_array_close(A@ptr) diff --git a/man/as_data_frame.Rd b/man/as_data_frame.Rd new file mode 100644 index 0000000000..f7fdf3ff4a --- /dev/null +++ b/man/as_data_frame.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SparseArray.R +\name{as_data_frame} +\alias{as_data_frame} +\title{Construct a data.frame from query results} +\usage{ +as_data_frame(dom, data) +} +\description{ +Construct a data.frame from query results +} diff --git a/man/tiledb_dense.Rd b/man/tiledb_dense.Rd index f3b6e9a5a5..5e1a4336b1 100644 --- a/man/tiledb_dense.Rd +++ b/man/tiledb_dense.Rd @@ -4,7 +4,8 @@ \alias{tiledb_dense} \title{Constructs a tiledb_dense object backed by a persisted tiledb array uri} \usage{ -tiledb_dense(ctx, uri, query_type = c("READ", "WRITE")) +tiledb_dense(ctx, uri, 
query_type = c("READ", "WRITE"), + as.data.frame = TRUE) } \arguments{ \item{ctx}{tiledb_ctx} diff --git a/man/tiledb_sparse.Rd b/man/tiledb_sparse.Rd index c9d0005885..c234bfd0b5 100644 --- a/man/tiledb_sparse.Rd +++ b/man/tiledb_sparse.Rd @@ -4,7 +4,8 @@ \alias{tiledb_sparse} \title{Constructs a tiledb_sparse object backed by a persisted tiledb array uri} \usage{ -tiledb_sparse(ctx, uri, query_type = c("READ", "WRITE")) +tiledb_sparse(ctx, uri, query_type = c("READ", "WRITE"), + as.data.frame = TRUE) } \arguments{ \item{ctx}{tiledb_ctx} diff --git a/man/tiledb_subarray.Rd b/man/tiledb_subarray.Rd index 88d928362c..a1a19549e0 100644 --- a/man/tiledb_subarray.Rd +++ b/man/tiledb_subarray.Rd @@ -7,7 +7,7 @@ tiledb_subarray(A, subarray_vector, attrs = c()) } \arguments{ -\item{A}{tiledb_sparse_array or tiledb_dense_array} +\item{A}{tiledb_sparse or tiledb_dense} \item{subarray_vector}{subarray to query} diff --git a/tests/testthat/test_DenseArray.R b/tests/testthat/test_DenseArray.R index e795e42275..6ad38f1bb6 100644 --- a/tests/testthat/test_DenseArray.R +++ b/tests/testthat/test_DenseArray.R @@ -50,40 +50,40 @@ test_that("Can read / write a simple 1D vector", { setup({ unlink_and_create(tmp) }) - + ctx <- tiledb_ctx() dim <- tiledb_dim(ctx, domain = c(1L, 10L)) dom <- tiledb_domain(ctx, c(dim)) val <- tiledb_attr(ctx, name="val") sch <- tiledb_array_schema(ctx, dom, c(val)) tiledb_array_create(tmp, sch) - - arr <- tiledb_dense(ctx, tmp) + + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) dat <- as.array(as.double(1:10)) arr[] <- dat - - arr <- tiledb_dense(ctx, tmp) + + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) expect_equal(arr[], dat) - + # explicit range enumeration expect_equal(arr[c(3,4,5,6,7)], dat[c(3,4,5,6,7)]) - + # vector range syntax expect_equal(arr[3:7], dat[3:7]) - + # vector range syntax (reversed) # TODO: find a way to efficiently do this # expect_equal(arr[7:3], dat[7:3]) - + # scalar indexing expect_equal(arr[8], dat[8]) - + arr[6] <- 
1000 expect_equal(arr[6], 1000) - + arr[7:10] <- c(97, 98, 99, 100) expect_equal(arr[6:10], as.array(c(1000, 97, 98, 99, 100))) - + teardown({ unlink(tmp, recursive = TRUE) }) @@ -104,7 +104,7 @@ test_that("Can read / write a simple 2D matrix", { tiledb_array_create(tmp, sch) dat <- matrix(rnorm(25), 5, 5) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) arr[] <- dat expect_equal(arr[], dat) @@ -143,7 +143,7 @@ test_that("Can read / write a simple 3D matrix", { tiledb_array_create(tmp, sch) dat <- array(rnorm(125), dim = c(5, 5, 5)) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) arr[] <- dat expect_equal(arr[], dat) @@ -178,7 +178,7 @@ test_that("Can read / write 1D multi-attribute array", { sch <- tiledb_array_schema(ctx, dom, c(a1, a2)) tiledb_array_create(tmp, sch) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) a1_dat <- as.array(as.double(1:10)) a2_dat <- as.array(as.double(11:20)) @@ -211,7 +211,7 @@ test_that("Can read / write 2D multi-attribute array", { sch <- tiledb_array_schema(ctx, dom, c(a1, a2)) tiledb_array_create(tmp, sch) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) a1_dat <- array(rnorm(100), dim = c(10, 10)) a2_dat <- array(rnorm(100), dim = c(10, 10)) @@ -251,7 +251,7 @@ test_that("as.array() conversion method", { sch <- tiledb_array_schema(ctx, dom, c(a1)) tiledb_array_create(tmp, sch) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) dat <- as.double(1:10) arr[] <- dat expect_equal(as.array(arr), as.array(dat)) @@ -275,7 +275,7 @@ test_that("as.data.frame() conversion method", { sch <- tiledb_array_schema(ctx, dom, c(a1, a2)) tiledb_array_create(tmp, sch) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) dat <- list(a1 = array(as.double(1:10)), a2 = array(as.double(1:10))) @@ -304,7 +304,7 @@ test_that("test tiledb_subarray 
read for dense array", { tiledb_array_create(tmp, sch) dat <- matrix(rnorm(25), 5, 5) - arr <- tiledb_dense(ctx, tmp) + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) arr[] <- dat expect_equal(arr[], dat) @@ -326,7 +326,7 @@ test_that("test tiledb_subarray read for dense array with select attributes", { setup({ unlink_and_create(tmp) }) - + ctx <- tiledb_ctx() d1 <- tiledb_dim(ctx, domain = c(1L, 5L)) d2 <- tiledb_dim(ctx, domain = c(1L, 5L)) @@ -335,23 +335,58 @@ test_that("test tiledb_subarray read for dense array with select attributes", { val2 <- tiledb_attr(ctx, name="val2") sch <- tiledb_array_schema(ctx, dom, c(val1, val2)) tiledb_array_create(tmp, sch) - + dat1 <- matrix(rnorm(25), 5, 5) dat2 <- matrix(rnorm(25), 5, 5) - arr <- tiledb_dense(ctx, tmp) - + arr <- tiledb_dense(ctx, tmp, as.data.frame=FALSE) + arr[] <- list(val1=dat1, val2=dat2) expect_equal(arr[]$val1, dat1) expect_equal(arr[]$val2, dat2) - + # explicit range enumeration - show(tiledb_subarray(arr, list(3,5, 3,5), attrs=c("val1"))) expect_equal(tiledb_subarray(arr, list(3,5, 3,5), attrs=c("val1")), dat1[c(3,4,5), c(3,4,5)]) - + # vector range syntax expect_equal(tiledb_subarray(arr, list(1,3,1,3), attrs=c("val2")), dat2[1:3, 1:3]) - + + teardown({ + unlink(tmp, recursive = TRUE) + }) +}) + + +test_that("test tiledb_subarray read for dense array as dataframe", { + tmp <- tempdir() + setup({ + unlink_and_create(tmp) + }) + + ctx <- tiledb_ctx() + d1 <- tiledb_dim(ctx, domain = c(1L, 5L)) + d2 <- tiledb_dim(ctx, domain = c(1L, 5L)) + dom <- tiledb_domain(ctx, c(d1, d2)) + val1 <- tiledb_attr(ctx, name="val1") + val2 <- tiledb_attr(ctx, name="val2") + sch <- tiledb_array_schema(ctx, dom, c(val1, val2)) + tiledb_array_create(tmp, sch) + + dat1 <- matrix(rnorm(25), 5, 5) + dat2 <- matrix(rnorm(25), 5, 5) + arr <- tiledb_dense(ctx, tmp, as.data.frame=TRUE) + + arr[] <- list(val1=dat1, val2=dat2) + expect_equal(arr[]$val1, unlist(as.list(dat1))) + expect_equal(arr[]$val2, unlist(as.list(dat2))) + 
+ # explicit range enumeration + expect_equal(tiledb_subarray(arr, list(3,5, 3,5), attrs=c("val1"))$val1, + unlist(as.list(dat1[c(3,4,5), c(3,4,5)]))) + + # vector range syntax + expect_equal(tiledb_subarray(arr, list(1,3,1,3), attrs=c("val2"))$val2, unlist(as.list(dat2[1:3, 1:3]))) + teardown({ unlink(tmp, recursive = TRUE) }) diff --git a/tests/testthat/test_SparseArray.R b/tests/testthat/test_SparseArray.R index 29bbbed59e..0682ed83e9 100644 --- a/tests/testthat/test_SparseArray.R +++ b/tests/testthat/test_SparseArray.R @@ -49,7 +49,7 @@ test_that("test tiledb_subarray read for sparse array", { tiledb_array_create(tmp, sch) dat <- matrix(rnorm(25), 5, 5) - arr <- tiledb_sparse(ctx, tmp) + arr <- tiledb_sparse(ctx, tmp, as.data.frame=FALSE) I <- c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5) J <- c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5) @@ -88,7 +88,7 @@ test_that("test tiledb_subarray read for sparse array with attribute list", { dat1 <- matrix(rnorm(25), 5, 5) dat2 <- matrix(rnorm(25), 5, 5) - arr <- tiledb_sparse(ctx, tmp) + arr <- tiledb_sparse(ctx, tmp, as.data.frame=FALSE) I <- c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5) J <- c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5) @@ -103,6 +103,46 @@ test_that("test tiledb_subarray read for sparse array with attribute list", { # vector range syntax expect_equal(tiledb_subarray(arr, list(1,3,1,3), attrs=c("val2"))$val2, unlist(as.list(dat2[1:3, 1:3]))) + teardown({ + unlink(tmp, recursive = TRUE) + }) +}) + +test_that("test tiledb_subarray read for sparse array as dataframe", { + tmp <- tempdir() + setup({ + unlink_and_create(tmp) + }) + + ctx <- tiledb_ctx() + d1 <- tiledb_dim(ctx, name="d1", domain = c(1L, 5L)) + d2 <- tiledb_dim(ctx, name="d2", domain = c(1L, 5L)) + dom <- tiledb_domain(ctx, c(d1, d2)) + val <- tiledb_attr(ctx, name="val") + val2 <- tiledb_attr(ctx, name="val2") + + sch <- tiledb_array_schema(ctx, dom, c(val, val2), sparse=TRUE) + tiledb_array_create(tmp, sch) + + 
dat1 <- matrix(rnorm(25), 5, 5) + dat2 <- matrix(rnorm(25), 5, 5) + + arr <- tiledb_sparse(ctx, tmp, as.data.frame=TRUE) + I <- c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5) + J <- c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5) + + arr[I, J] <- list(val=dat1, val2=dat2) + expect_equal(arr[]$val, unlist(as.list(dat1))) + + # explicit range enumeration + res <- tiledb_subarray(arr, list(3,5, 3,5), attrs=c("val")) + expect_is(res, "data.frame") + expect_equal(res$val, + unlist(as.list(dat1[c(3,4,5), c(3,4,5)]))) + + # vector range syntax + expect_equal(tiledb_subarray(arr, list(1,3,1,3), attrs=c("val2"))$val2, unlist(as.list(dat2[1:3, 1:3]))) + teardown({ unlink(tmp, recursive = TRUE) })