Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct nullable string validity map treatment #517

Merged
merged 14 commits into from
Feb 20, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
^\.Rproj\.user$
^\.travis\.yml$
^_pgkdown.yml
^.*\.tar\.gz$
^tiledb.*\.tar\.gz$
^docs
^_pkgdown.yml$
^inst/examples/quickstart_dense$
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

* This release of the R package builds against [TileDB 2.14.1](https://github.com/TileDB-Inc/TileDB/releases/tag/2.14.1), and has also been tested against earlier releases as well as the development version (#502).

## Breaking Changes

* The validity map coding of nullable strings has been corrected: validity map values of one are now interpreted as valid/non-null for full compatibility with other TileDB projects. Previously written arrays with nullable strings can be read by setting the config option `r.legacy_validity_mode` to `true`; the option also permits to write to an older installation. A conversion helper script is provided in `scripts/legacy_validity_convert.r`. (#517)

## Improvements

* Attributes can now be created, written and read from in (explicit) UTF8 types (and CHAR and ASCII already behaved correctly with respect to utf8 data) (#510)
Expand Down
12 changes: 10 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,16 @@ libtiledb_query_buffer_var_char_alloc_direct <- function(szoffsets, szdata, null
.Call(`_tiledb_libtiledb_query_buffer_var_char_alloc_direct`, szoffsets, szdata, nullable, cols)
}

libtiledb_query_buffer_var_char_create <- function(vec, nullable) {
.Call(`_tiledb_libtiledb_query_buffer_var_char_create`, vec, nullable)
libtiledb_query_buffer_var_char_get_legacy_validity_value <- function(ctx, validity_override = FALSE) {
.Call(`_tiledb_libtiledb_query_buffer_var_char_get_legacy_validity_value`, ctx, validity_override)
}

libtiledb_query_buffer_var_char_legacy_validity_mode <- function(ctx, buf, validity_override = FALSE) {
.Call(`_tiledb_libtiledb_query_buffer_var_char_legacy_validity_mode`, ctx, buf, validity_override)
}

libtiledb_query_buffer_var_char_create <- function(vec, nullable, legacy_validity = FALSE) {
.Call(`_tiledb_libtiledb_query_buffer_var_char_create`, vec, nullable, legacy_validity)
}

libtiledb_query_set_buffer_var_char <- function(query, attr, bufptr) {
Expand Down
6 changes: 4 additions & 2 deletions R/TileDBArray.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -836,6 +836,7 @@ setMethod("[", "tiledb_array",
if (type %in% c("CHAR", "ASCII", "UTF8")) {
spdl::debug("[getBuffer] '{}' allocating 'char' {} rows given budget of {}", name, resrv, memory_budget)
buf <- libtiledb_query_buffer_var_char_alloc_direct(resrv, memory_budget, nullable)
buf <- libtiledb_query_buffer_var_char_legacy_validity_mode(ctx@ptr, buf)
qryptr <- libtiledb_query_set_buffer_var_char(qryptr, name, buf)
buf
} else {
Expand Down Expand Up @@ -1242,14 +1243,15 @@ setMethod("[<-", "tiledb_array",
else { if (sparse) "UNORDERED" else "COL_MAJOR" })

buflist <- vector(mode="list", length=nc)
legacy_validity <- libtiledb_query_buffer_var_char_get_legacy_validity_value(ctx@ptr)

for (colnam in allnames) {
## when an index column is use this may be unordered to remap to position in 'nm' names
k <- match(colnam, nm)
if (alltypes[k] %in% c("CHAR", "ASCII", "UTF8")) { # variable length
txtvec <- as.character(value[[k]])
spdl::debug("[tiledb_array] '[<-' alloc char buffer {} '{}': {}", k, colnam, alltypes[k])
buflist[[k]] <- libtiledb_query_buffer_var_char_create(txtvec, allnullable[k])
buflist[[k]] <- libtiledb_query_buffer_var_char_create(txtvec, allnullable[k], legacy_validity)
qryptr <- libtiledb_query_set_buffer_var_char(qryptr, colnam, buflist[[k]])
} else {
col <- value[[k]]
Expand Down
72 changes: 71 additions & 1 deletion R/Utils.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -223,3 +223,73 @@ r_to_tiledb_type <- function(x) {
.assertArray <- function(arr) {
stopifnot(is(arr, "tiledb_sparse") || is(arr, "tiledb_dense") || is(arr, "tiledb_array"))
}

## conversion helper from (and to) legacy validity map for nullable strings
.legacy_validity <- function(inuri,
outdir = NULL,
tolegacy = FALSE,
usetmp = FALSE,
verbose = FALSE,
debug = FALSE) {

stopifnot("'inuri' must be an existing directory" = dir.exists(inuri))

if (verbose)
cat("Running with tiledb R package version", format(packageVersion("tiledb")),
"and TileDB Core version", format(tiledb_version(TRUE)), "\n")

array <- basename(inuri)
if (debug) print(summary(tiledb_array(inuri, strings_as_factors=TRUE)[]))

newdir <- ""
if (isTRUE(usetmp)) newdir <- tempfile()
if (!is.null(outdir)) newdir <- outdir
if (newdir == "")
stop("If '--usetmp' is not given then '--out OUT' must be given.", call. = FALSE)

if (!dir.exists(newdir)) dir.create(newdir)
res <- file.copy(inuri, newdir, recursive=TRUE)
newuri <- file.path(newdir, array)
arr <- tiledb_array(newuri)

attrlst <- attrs(schema(arr))
is_nullable_string <- function(x) datatype(x) %in% c("ASCII", "CHAR", "UTF8") &&
tiledb_attribute_get_nullable(x)
stringcols <- Filter(is_nullable_string, attrlst)
if (length(stringcols) == 0) {
stop("No string columns in array so nothing to do. Exiting.\n", call. = FALSE)
}

oldcfg <- cfg <- tiledb_config()
cfg["r.legacy_validity_mode"] <- if (tolegacy) "false" else "true"
ctx <- tiledb_ctx(cfg)
dat <- tiledb_array(newuri, return_as="data.frame", strings_as_factors=TRUE)[]
if (debug) print(summary(dat))

## delete fully via an 'always true' condition testing for 'FALSE || TRUE'
colnm <- names(stringcols)[1] # we know from above that this is string column
cnd <- sprintf(r"(%s == "" || %s != "")", colnm, colnm)
qc <- do.call(tiledb::parse_query_condition, list(expr = str2lang(cnd), ta = arr))
arr <- tiledb_array(newuri)
qry <- tiledb_query(arr, "DELETE")
qry <- tiledb_query_set_condition(qry, qc)
tiledb_query_submit(qry)
tiledb_query_finalize(qry)
if (debug) {
cat("Deleted old\n")
print(tiledb_array(newuri)[])
}

cfg["r.legacy_validity_mode"] <- if (tolegacy) "false" else "true"
ctx <- tiledb_ctx(cfg)
fromDataFrame(dat[,-1], newuri, mode="append") # unless we say append complains about exists

chk <- tiledb_array(newuri, strings_as_factors=TRUE)[]
if (debug) {
cat("Written back.\n")
print(summary(chk))
}
if (verbose) cat("Done.\n")
ctx <- tiledb_ctx(oldcfg) # reset
invisible()
}
1 change: 1 addition & 0 deletions inst/include/tiledb.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ struct var_length_char_buffer {
int32_t rows, cols; // dimension from subarray
bool nullable; // flag
std::vector<uint8_t> validity_map; // for nullable vectors
bool legacy_validity; // for legacy validity mode
};
typedef struct var_length_char_buffer vlc_buf_t;

Expand Down
Binary file added inst/sampledata/legacy_validity.tar.gz
Binary file not shown.
Binary file added inst/sampledata/legacy_write.tar.gz
Binary file not shown.
23 changes: 23 additions & 0 deletions inst/scripts/legacy_validity_convert.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env Rscript

## load docopt and tiledb
suppressMessages({
library(docopt) # for command-line argument parsing and help generation
library(tiledb)
})

## configuration for docopt
doc <- "Usage: convert.t [-h] [-v] [-d] [-t] [-o OUTDIR | -u] INDIR

-o --out OUTDIR write converted array into OUTDIR, if not given conversion in place [default: ]
-u --usetmp do not require output directory and use a temporary directory [default: FALSE]
-t --tolegacy convert to (instead of from) legacy validity mode
-v --verbose show extra output while processing
-d --debug show extra debug information
-h --help show this help tex
"

opt <- docopt(doc) # docopt parsing
#if (opt$debug) print(opt)

tiledb:::.legacy_validity(opt$INDIR, opt$out, opt$tolegacy, opt$usetmp, opt$verbose, opt$debug)
7 changes: 4 additions & 3 deletions inst/tinytest/test_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,13 @@ uri <- tempfile()
pp <- palmerpenguins::penguins
fromDataFrame(pp, uri, sparse = TRUE, col_index = c("species", "year"))

qc <- parse_query_condition(body_mass_g > 4000 && sex == "male")
qc <- parse_query_condition(body_mass_g > 4000 || island == "Biscoe" || sex == "male")
arr <- tiledb_array(uri)
qry <- tiledb_query(arr, "DELETE")
qry <- tiledb_query_set_condition(qry, qc)
tiledb_query_submit(qry)
tiledb_query_finalize(qry)

oo <- tiledb_array(uri, return_as="data.frame")[]
expect_equal(nrow(oo), 177) # instead of 344 pre-deletion
oo <- tiledb_array(uri, return_as="data.frame", strings_as_factors=TRUE)[]

expect_equal(nrow(oo), 84) # instead of 344 pre-deletion
48 changes: 48 additions & 0 deletions inst/tinytest/test_tiledbarray.R
Original file line number Diff line number Diff line change
Expand Up @@ -1472,3 +1472,51 @@ oo <- penguins
expect_equal(sum(is.na(oo$sex)), sum(is.na(pp$sex)))
expect_equal(sum(oo$sex == "male"), sum(pp$sex == "male"))
expect_equal(sum(oo$sex == "female"), sum(pp$sex == "female"))


## [214] legacy validity mode
tdir <- tempfile()
tgzfile <- system.file("sampledata", "legacy_validity.tar.gz", package="tiledb")
untar(tarfile = tgzfile, exdir = tdir)
uri <- file.path(tdir, "legacy_validity")
cfg <- tiledb_config()
oldcfg <- cfg
cfg["r.legacy_validity_mode"] <- "true"
ctx <- tiledb_ctx(cfg)
arr <- tiledb_array(uri, strings_as_factors=FALSE, return_as="data.frame")[]
expect_equal(dim(arr)[1], 10)
expect_equal(dim(arr)[2], 3)
expect_equivalent(arr, data.frame(key=1:10,
val1=c(letters[1:4], NA, letters[6:7], NA, letters[9:10]),
val2=LETTERS[1:10]))
expect_equal(arr$val1, c(letters[1:4], NA, letters[6:7], NA, letters[9:10]))
ctx <- tiledb_ctx(oldcfg) # reset config

## [218] test conversion
tdir <- tempfile()
tgzfile <- system.file("sampledata", "legacy_write.tar.gz", package="tiledb")
untar(tarfile = tgzfile, exdir = tdir)
inuri <- file.path(tdir, "legacy_write", "penguins")

outdir <- tempfile()
dir.create(outdir)
cfg["r.legacy_validity_mode"] <- "false" # reset to no conversion to read 'before'
ctx <- tiledb_ctx(cfg)
before <- tiledb_array(inuri, strings_as_factors=TRUE)[]
expect_equal(sum(is.na(before$sex)), 333)

tiledb:::.legacy_validity(inuri, outdir)
outuri <- file.path(outdir, "penguins")
after <- tiledb_array(outuri, strings_as_factors=TRUE)[]
expect_equal(sum(is.na(after$sex)), 11)
for (col in colnames(before)[-c(1,8)]) # exclude __tiledb_rows and sex
expect_equal(before[[col]], after[[col]])

newout <- tempfile()
dir.create(newout)
tiledb:::.legacy_validity(outuri, newout, tolegacy=TRUE)
rvturi <- file.path(newout, "penguins")
revert <- tiledb_array(rvturi, strings_as_factors=TRUE)[]
expect_equal(sum(is.na(revert$sex)), 333)
for (col in colnames(before)[-c(1,8)]) # exclude __tiledb_rows
expect_equal(before[[col]], revert[[col]])
36 changes: 32 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1639,15 +1639,41 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// libtiledb_query_buffer_var_char_get_legacy_validity_value
bool libtiledb_query_buffer_var_char_get_legacy_validity_value(XPtr<tiledb::Context> ctx, bool validity_override);
RcppExport SEXP _tiledb_libtiledb_query_buffer_var_char_get_legacy_validity_value(SEXP ctxSEXP, SEXP validity_overrideSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< XPtr<tiledb::Context> >::type ctx(ctxSEXP);
Rcpp::traits::input_parameter< bool >::type validity_override(validity_overrideSEXP);
rcpp_result_gen = Rcpp::wrap(libtiledb_query_buffer_var_char_get_legacy_validity_value(ctx, validity_override));
return rcpp_result_gen;
END_RCPP
}
// libtiledb_query_buffer_var_char_legacy_validity_mode
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_legacy_validity_mode(XPtr<tiledb::Context> ctx, XPtr<vlc_buf_t> buf, bool validity_override);
RcppExport SEXP _tiledb_libtiledb_query_buffer_var_char_legacy_validity_mode(SEXP ctxSEXP, SEXP bufSEXP, SEXP validity_overrideSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< XPtr<tiledb::Context> >::type ctx(ctxSEXP);
Rcpp::traits::input_parameter< XPtr<vlc_buf_t> >::type buf(bufSEXP);
Rcpp::traits::input_parameter< bool >::type validity_override(validity_overrideSEXP);
rcpp_result_gen = Rcpp::wrap(libtiledb_query_buffer_var_char_legacy_validity_mode(ctx, buf, validity_override));
return rcpp_result_gen;
END_RCPP
}
// libtiledb_query_buffer_var_char_create
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_create(CharacterVector vec, bool nullable);
RcppExport SEXP _tiledb_libtiledb_query_buffer_var_char_create(SEXP vecSEXP, SEXP nullableSEXP) {
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_create(CharacterVector vec, bool nullable, bool legacy_validity);
RcppExport SEXP _tiledb_libtiledb_query_buffer_var_char_create(SEXP vecSEXP, SEXP nullableSEXP, SEXP legacy_validitySEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type vec(vecSEXP);
Rcpp::traits::input_parameter< bool >::type nullable(nullableSEXP);
rcpp_result_gen = Rcpp::wrap(libtiledb_query_buffer_var_char_create(vec, nullable));
Rcpp::traits::input_parameter< bool >::type legacy_validity(legacy_validitySEXP);
rcpp_result_gen = Rcpp::wrap(libtiledb_query_buffer_var_char_create(vec, nullable, legacy_validity));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -3312,7 +3338,9 @@ static const R_CallMethodDef CallEntries[] = {
{"_tiledb_libtiledb_query_set_subarray", (DL_FUNC) &_tiledb_libtiledb_query_set_subarray, 2},
{"_tiledb_libtiledb_query_set_buffer", (DL_FUNC) &_tiledb_libtiledb_query_set_buffer, 3},
{"_tiledb_libtiledb_query_buffer_var_char_alloc_direct", (DL_FUNC) &_tiledb_libtiledb_query_buffer_var_char_alloc_direct, 4},
{"_tiledb_libtiledb_query_buffer_var_char_create", (DL_FUNC) &_tiledb_libtiledb_query_buffer_var_char_create, 2},
{"_tiledb_libtiledb_query_buffer_var_char_get_legacy_validity_value", (DL_FUNC) &_tiledb_libtiledb_query_buffer_var_char_get_legacy_validity_value, 2},
{"_tiledb_libtiledb_query_buffer_var_char_legacy_validity_mode", (DL_FUNC) &_tiledb_libtiledb_query_buffer_var_char_legacy_validity_mode, 3},
{"_tiledb_libtiledb_query_buffer_var_char_create", (DL_FUNC) &_tiledb_libtiledb_query_buffer_var_char_create, 3},
{"_tiledb_libtiledb_query_set_buffer_var_char", (DL_FUNC) &_tiledb_libtiledb_query_set_buffer_var_char, 3},
{"_tiledb_libtiledb_query_get_buffer_var_char", (DL_FUNC) &_tiledb_libtiledb_query_get_buffer_var_char, 3},
{"_tiledb_libtiledb_query_get_buffer_var_char_simple", (DL_FUNC) &_tiledb_libtiledb_query_get_buffer_var_char_simple, 1},
Expand Down
47 changes: 41 additions & 6 deletions src/libtiledb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2691,17 +2691,41 @@ XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_alloc_direct(double szoffsets, d
buf->cols = cols;
buf->nullable = nullable;
buf->validity_map.resize(static_cast<size_t>(szdata));
buf->legacy_validity = false; // for legacy validity mode
return buf;
}

// [[Rcpp::export]]
bool libtiledb_query_buffer_var_char_get_legacy_validity_value(XPtr<tiledb::Context> ctx,
bool validity_override = false) {
check_xptr_tag<tiledb::Context>(ctx);
XPtr<tiledb::Config> cfg = libtiledb_ctx_config(ctx);
Rcpp::CharacterVector vec = libtiledb_config_get(cfg, "r.legacy_validity_mode");
bool legacy_validity = std::string("true") == std::string(vec[0]) || validity_override;
return legacy_validity;
}

// [[Rcpp::export]]
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_legacy_validity_mode(XPtr<tiledb::Context> ctx,
XPtr<vlc_buf_t> buf,
bool validity_override = false) {
buf->legacy_validity = libtiledb_query_buffer_var_char_get_legacy_validity_value(ctx,
validity_override);
spdl::debug(tfm::format("[libtiledb_query_buffer_var_char_legacy_validity_mode] "
"legacy_validity set to %s", buf->legacy_validity ? "true" : "false"));
return buf;
}

// assigning (for a write) allocates
// [[Rcpp::export]]
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_create(CharacterVector vec, bool nullable) {
XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_create(CharacterVector vec, bool nullable,
bool legacy_validity = false) {
size_t n = vec.size();
XPtr<vlc_buf_t> bufptr = make_xptr<vlc_buf_t>(new vlc_buf_t);
bufptr->offsets.resize(n);
bufptr->validity_map.resize(n);
bufptr->nullable = nullable;
bufptr->legacy_validity = legacy_validity;
bufptr->str = "";
uint64_t cumlen = 0;
for (size_t i=0; i<n; i++) {
Expand All @@ -2710,7 +2734,11 @@ XPtr<vlc_buf_t> libtiledb_query_buffer_var_char_create(CharacterVector vec, bool
bufptr->str += s;
cumlen += s.length();
if (nullable) {
bufptr->validity_map[i] = vec[i] == NA_STRING;
if (legacy_validity) {
bufptr->validity_map[i] = vec[i] == R_NaString;
} else {
bufptr->validity_map[i] = vec[i] != R_NaString;
}
}
}
bufptr->rows = bufptr->cols = 0; // signal unassigned for the write case
Expand Down Expand Up @@ -2763,10 +2791,17 @@ CharacterMatrix libtiledb_query_get_buffer_var_char(XPtr<vlc_buf_t> bufptr,
CharacterMatrix mat(bufptr->rows, bufptr->cols);
for (size_t i = 0; i < n; i++) {
if (bufptr->nullable) {
if (bufptr->validity_map[i] == 0)
mat[i] = std::string(&bufptr->str[bufptr->offsets[i]], str_sizes[i]);
else
mat[i] = R_NaString;
if (bufptr->legacy_validity) {
if (bufptr->validity_map[i] == 0)
mat[i] = std::string(&bufptr->str[bufptr->offsets[i]], str_sizes[i]);
else
mat[i] = R_NaString;
} else {
if (bufptr->validity_map[i] != 0)
mat[i] = std::string(&bufptr->str[bufptr->offsets[i]], str_sizes[i]);
else
mat[i] = R_NaString;
}
} else {
mat[i] = std::string(&bufptr->str[bufptr->offsets[i]], str_sizes[i]);
}
Expand Down
2 changes: 1 addition & 1 deletion src/shmem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ XPtr<vlc_buf_t> vlcbuf_from_shmem(std::string datapath, std::string dtype) {
buf->rows = buf->offsets.size();
buf->cols = 2; // value not used
buf->nullable = false; // default, overridden below if validity path used

buf->legacy_validity = false; // may need to open door to config option here too
if (debug) Rcpp::Rcout << datapath << " " << offsetspath
<< " data:" << buf->str.size()
<< " offsets:" << buf->offsets.size();
Expand Down