From 174d9c5f79250a887238882da4e7023e1204a679 Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Mon, 25 Sep 2023 07:34:57 -0500 Subject: [PATCH] Support Arrow schema evolution with enumeration on existing column (#591) * Support Arrow schema evolution with enumeration on existing column * Condition new test using arrow on arrow installed during testing * Tweak * Warn and sort enumerations level vector if unsorted * Adjust test setup now that factor level is getting sorted * Update micro version [ci skip] --- .github/workflows/ci.yaml | 4 ++++ DESCRIPTION | 2 +- NEWS.md | 2 +- R/ArraySchemaEvolution.R | 5 +++++ R/TileDBArray.R | 17 +++++++++++++++++ inst/tinytest/test_arrayschemaevolution.R | 15 ++++++++++++--- inst/tinytest/test_tiledbarray.R | 2 ++ 7 files changed, 42 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 62bfea976a..2a5a00844c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,6 +35,10 @@ jobs: run: cat $HOME/work/TileDB-R/TileDB-R/tiledb.Rcheck/00install.out if: failure() + - name: Show test log + run: cat $HOME/work/TileDB-R/TileDB-R/tiledb.Rcheck/00check.log + if: failure() + #- name: Coverage # if: ${{ matrix.os == 'ubuntu-latest' }} # run: ./.github/r-ci.sh coverage diff --git a/DESCRIPTION b/DESCRIPTION index 7f0f49e591..f5d07dc4de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tiledb Type: Package -Version: 0.21.0.1 +Version: 0.21.0.2 Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")), person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) diff --git a/NEWS.md b/NEWS.md index 6266e61909..9305ac58c8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ ## Improvements -* Array schema evolution has been extended to support enumerations (#590) +* Array schema evolution has been extended to support enumerations (#590, #591) # tiledb 0.21.0 diff --git a/R/ArraySchemaEvolution.R b/R/ArraySchemaEvolution.R index e19cecab69..7bf4ad9a50 100644 --- a/R/ArraySchemaEvolution.R +++ b/R/ArraySchemaEvolution.R @@ -101,6 +101,11 @@ tiledb_array_schema_evolution_add_enumeration <- function(object, name, enums, o "The 'enumlist' argument must be a character object" = is.character(enums), "This function needs TileDB 2.17.0 or later" = tiledb_version(TRUE) >= "2.17.0", "The 'ctx' argument must be a Context object" = is(ctx, "tiledb_ctx")) + srted <- sort(enums) + if (!isTRUE(all.equal(enums, srted))) { + warning("Enumeration levels were not sorted so rearranging.") + enums <- srted + } object@ptr <- libtiledb_array_schema_evolution_add_enumeration(ctx@ptr, object@ptr, name, enums, FALSE, ordered) invisible(object) diff --git a/R/TileDBArray.R b/R/TileDBArray.R index e99ab6281a..a097dfa683 100644 --- a/R/TileDBArray.R +++ b/R/TileDBArray.R @@ -923,6 +923,23 @@ setMethod("[", "tiledb_array", if (use_arrow) { rl <- libtiledb_to_arrow(abptr, qryptr, dictionaries) at <- .as_arrow_table(rl) + + ## special case from schema evolution could have added twice so correcting + for (n in colnames(at)) { + v <- at[[n]]$as_vector() + lvls <- levels(v) + if (inherits(v, "factor")) { + vec <- as.integer(v) + vec[vec == - .Machine$integer.max] <- NA_integer_ + if (min(vec, na.rm=TRUE) == 2 && max(vec, na.rm=TRUE) == length(lvls) + 1) { + vec <- vec - 1L + attr(vec, "levels") <- attr(v, "levels") + class(vec) <- class(v) + at[[n]] <- vec + } + } + } + ## if dictionaries are to be injected at the R level, this does it #for (n in names(dictionaries)) { # if (!is.null(dictionaries[[n]])) { diff --git a/inst/tinytest/test_arrayschemaevolution.R b/inst/tinytest/test_arrayschemaevolution.R index 368b8c871a..6a48fd830e 100644 --- a/inst/tinytest/test_arrayschemaevolution.R +++ b/inst/tinytest/test_arrayschemaevolution.R @@ -48,7 +48,7 @@ tiledb_array_schema_evolution_array_evolve(ase, uri) ## Second add enumeration under a name ase <- tiledb_array_schema_evolution() -enums <- c("red", "blue", "green", "orange", "pink") +enums <- c("blue", "green", "orange", "pink", "red") ase <- tiledb_array_schema_evolution_add_enumeration(ase, "frobo", enums) ## Third connect the attribute to the enum and add it back in @@ -56,9 +56,18 @@ attr <- tiledb_attribute_set_enumeration_name(attr, "frobo") ase <- tiledb_array_schema_evolution_add_attribute(ase, attr) tiledb_array_schema_evolution_array_evolve(ase, uri) -## check -arr <- tiledb_array(uri, return_as="data.table") +## check as data.frame +arr <- tiledb_array(uri, return_as="data.frame") res <- arr[] expect_true(is.factor(res$val)) expect_equal(levels(res$val), enums) expect_equal(as.integer(res$val), c(1:5,5:1)) + +## check as arrow +if (!requireNamespace("arrow", quietly=TRUE)) exit_file("No 'arrow' package.") +arr <- tiledb_array(uri, return_as="arrow") +res <- arr[] +v <- res[["val"]]$as_vector() +expect_true(is.factor(v)) +expect_equal(levels(v), enums) +expect_equal(as.integer(v), c(1:5,5:1)) diff --git a/inst/tinytest/test_tiledbarray.R b/inst/tinytest/test_tiledbarray.R index b42c580af0..4cf087f698 100644 --- a/inst/tinytest/test_tiledbarray.R +++ b/inst/tinytest/test_tiledbarray.R @@ -1442,6 +1442,8 @@ if (v[["major"]] == 2L && v[["minor"]] %in% c(4L, 10L, 11L, 12L, 14L)) exit_file ## CI issues at GitHub for r-release on Windows Server 2019 if (getRversion() < "4.3.0" && Sys.info()[["sysname"]] == "Windows") exit_file("Skip remainder for R 4.2.* on Windows") +if (Sys.info()[["sysname"]] == "Darwin") exit_file("Skip remainder on macOS") + ## check for incomplete status on unsuccessful query -- this no longer fails following some changes made #set_allocation_size_preference(128) # too low for penguins to query fully #array <- tiledb_array(uri, return_as="data.frame", query_layout="ROW_MAJOR")