Skip to content

Commit

Permalink
Support Arrow schema evolution with enumeration on existing column (#591)
Browse files Browse the repository at this point in the history

* Support Arrow schema evolution with enumeration on existing column

* Condition new test using arrow on arrow installed during testing

* Tweak

* Warn and sort enumerations level vector if unsorted

* Adjust test setup now that factor level is getting sorted

* Update micro version [ci skip]
  • Loading branch information
eddelbuettel authored Sep 25, 2023
1 parent 1e7bd2f commit 174d9c5
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 5 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ jobs:
run: cat $HOME/work/TileDB-R/TileDB-R/tiledb.Rcheck/00install.out
if: failure()

- name: Show test log
run: cat $HOME/work/TileDB-R/TileDB-R/tiledb.Rcheck/00check.log
if: failure()

#- name: Coverage
# if: ${{ matrix.os == 'ubuntu-latest' }}
# run: ./.github/r-ci.sh coverage
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tiledb
Type: Package
Version: 0.21.0.1
Version: 0.21.0.2
Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays
Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre"))
Expand Down
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

## Improvements

* Array schema evolution has been extended to support enumerations (#590)
* Array schema evolution has been extended to support enumerations (#590, #591)


# tiledb 0.21.0
Expand Down
5 changes: 5 additions & 0 deletions R/ArraySchemaEvolution.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ tiledb_array_schema_evolution_add_enumeration <- function(object, name, enums, o
"The 'enumlist' argument must be a character object" = is.character(enums),
"This function needs TileDB 2.17.0 or later" = tiledb_version(TRUE) >= "2.17.0",
"The 'ctx' argument must be a Context object" = is(ctx, "tiledb_ctx"))
srted <- sort(enums)
if (!isTRUE(all.equal(enums, srted))) {
warning("Enumeration levels were not sorted so rearranging.")
enums <- srted
}
object@ptr <- libtiledb_array_schema_evolution_add_enumeration(ctx@ptr, object@ptr, name,
enums, FALSE, ordered)
invisible(object)
Expand Down
17 changes: 17 additions & 0 deletions R/TileDBArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,23 @@ setMethod("[", "tiledb_array",
if (use_arrow) {
rl <- libtiledb_to_arrow(abptr, qryptr, dictionaries)
at <- .as_arrow_table(rl)

## special case: after schema evolution the factor codes could have been offset by one twice, so correct for that
for (n in colnames(at)) {
v <- at[[n]]$as_vector()
lvls <- levels(v)
if (inherits(v, "factor")) {
vec <- as.integer(v)
vec[vec == - .Machine$integer.max] <- NA_integer_
if (min(vec, na.rm=TRUE) == 2 && max(vec, na.rm=TRUE) == length(lvls) + 1) {
vec <- vec - 1L
attr(vec, "levels") <- attr(v, "levels")
class(vec) <- class(v)
at[[n]] <- vec
}
}
}

## if dictionaries are to be injected at the R level, this does it
#for (n in names(dictionaries)) {
# if (!is.null(dictionaries[[n]])) {
Expand Down
15 changes: 12 additions & 3 deletions inst/tinytest/test_arrayschemaevolution.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,26 @@ tiledb_array_schema_evolution_array_evolve(ase, uri)

## Second add enumeration under a name
ase <- tiledb_array_schema_evolution()
enums <- c("red", "blue", "green", "orange", "pink")
enums <- c("blue", "green", "orange", "pink", "red")
ase <- tiledb_array_schema_evolution_add_enumeration(ase, "frobo", enums)

## Third connect the attribute to the enum and add it back in
attr <- tiledb_attribute_set_enumeration_name(attr, "frobo")
ase <- tiledb_array_schema_evolution_add_attribute(ase, attr)
tiledb_array_schema_evolution_array_evolve(ase, uri)

## check
arr <- tiledb_array(uri, return_as="data.table")
## check as data.frame
arr <- tiledb_array(uri, return_as="data.frame")
res <- arr[]
expect_true(is.factor(res$val))
expect_equal(levels(res$val), enums)
expect_equal(as.integer(res$val), c(1:5,5:1))

## check as arrow
if (!requireNamespace("arrow", quietly=TRUE)) exit_file("No 'arrow' package.")
arr <- tiledb_array(uri, return_as="arrow")
res <- arr[]
v <- res[["val"]]$as_vector()
expect_true(is.factor(v))
expect_equal(levels(v), enums)
expect_equal(as.integer(v), c(1:5,5:1))
2 changes: 2 additions & 0 deletions inst/tinytest/test_tiledbarray.R
Original file line number Diff line number Diff line change
Expand Up @@ -1442,6 +1442,8 @@ if (v[["major"]] == 2L && v[["minor"]] %in% c(4L, 10L, 11L, 12L, 14L)) exit_file
## CI issues at GitHub for r-release on Windows Server 2019
if (getRversion() < "4.3.0" && Sys.info()[["sysname"]] == "Windows") exit_file("Skip remainder for R 4.2.* on Windows")

if (Sys.info()[["sysname"]] == "Darwin") exit_file("Skip remainder on macOS")

## check for incomplete status on unsuccessful query -- this no longer fails following some changes made
#set_allocation_size_preference(128) # too low for penguins to query fully
#array <- tiledb_array(uri, return_as="data.frame", query_layout="ROW_MAJOR")
Expand Down

0 comments on commit 174d9c5

Please sign in to comment.