Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure factor additions do not overflow index type #645

Merged
merged 3 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ libtiledb_version <- function() {
.Call(`_tiledb_libtiledb_version`)
}

tiledb_datatype_max_value <- function(datatype) {
.Call(`_tiledb_tiledb_datatype_max_value`, datatype)
}

libtiledb_ctx <- function(config = NULL) {
.Call(`_tiledb_libtiledb_ctx`, config)
}
Expand Down
14 changes: 9 additions & 5 deletions R/TileDBArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -1411,13 +1411,17 @@ setMethod("[<-", "tiledb_array",
"INT8", "INT16", "INT32", "INT64")) {
dictionary <- tiledb_attribute_get_enumeration_vector_ptr(attr, arrptr)
} else {
stop("Unsupported enumeration vector payload of type '%s'", tpstr, call. = FALSE)
stop("Unsupported enumeration vector payload of type '", tpstr, "'.", call. = FALSE)
}
added_enums <- setdiff(new_levels, dictionary)
if (length(added_enums) > 0) {
spdl::debug("[tiledb_array] '[<-' Adding levels '{}' at '{}' ({} + {})",
paste(added_enums, collapse=","), allnames[[k]],
length(dictionary), length(added_enums))
maxval <- tiledb_datatype_max_value(alltypes[k])
spdl::debug("[tiledb_array] '[<-' Adding levels '{}' at '{}' {} ({} + {} ? {})",
paste(added_enums, collapse=","), allnames[k], alltypes[k], length(dictionary), length(added_enums), maxval);
if (length(dictionary) + length(added_enums) > maxval) {
stop(sprintf("For column '%s' cannot add %d factor levels to existing %d for type '%s' with maximum value %d",
colnam, length(added_enums), length(dictionary), alltypes[k], maxval), call. = FALSE)
}
levels <- unique(c(dictionary, new_levels))
is_ordered <- tiledb_attribute_is_ordered_enumeration_ptr(attr, arrptr)
value[[k]] <- factor(value[[k]], levels = levels, ordered = is_ordered)
Expand All @@ -1427,7 +1431,7 @@ setMethod("[<-", "tiledb_array",
arr <- tiledb_array_open(x)
else
arr <- x
ase <- tiledb_array_schema_evolution_extend_enumeration(ase, arr, allnames[[k]], added_enums)
ase <- tiledb_array_schema_evolution_extend_enumeration(ase, arr, allnames[k], added_enums)
tiledb::tiledb_array_schema_evolution_array_evolve(ase, uri)
value[[k]] <- factor(value[[k]], levels = unique(c(dictionary, added_enums)), ordered=is.ordered(value[[k]]))
}
Expand Down
22 changes: 22 additions & 0 deletions inst/tinytest/test_arrayschemaevolution.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,25 @@ expect_silent(fromDataFrame(df2, uri, mode="append", col_index=1))

res <- tiledb_array(uri, return_as="data.frame")[]
expect_equivalent(res, rbind(df1, df2)) # factors in data.frame get releveled too

## check factor additions do not overflow
for (tp in c("INT8", "UINT8")) {
uri <- tempfile()

dom <- tiledb_domain(dims = tiledb_dim("rows", c(1L, 300L), 10L, "INT32"))
attrs <- c(tiledb_attr("a", type = "INT32"),
tiledb_attr("b", type = tp))
schema <- tiledb_array_schema(dom, attrs = attrs, sparse = TRUE)
schema <- tiledb_array_schema_set_enumeration_empty(schema, attrs[[2]], "b")
invisible(tiledb_array_create(uri, schema))

df <- data.frame(rows = 1:50, a = 100 + 0:49, b = factor(paste0("f", 101:150)))
expect_silent(fromDataFrame(df, uri, mode="append", col_index=1))

df <- data.frame(rows = 51:130, a = 200 + 0:79, b = factor(paste0("f", 151:230)))
if (tp == "INT8") {
expect_error(fromDataFrame(df, uri, mode="append", col_index=1)) # errors for INT8
} else {
expect_silent(fromDataFrame(df, uri, mode="append", col_index=1)) # passes for UINT8
}
}
12 changes: 12 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,17 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// tiledb_datatype_max_value
size_t tiledb_datatype_max_value(const std::string& datatype);
RcppExport SEXP _tiledb_tiledb_datatype_max_value(SEXP datatypeSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type datatype(datatypeSEXP);
rcpp_result_gen = Rcpp::wrap(tiledb_datatype_max_value(datatype));
return rcpp_result_gen;
END_RCPP
}
// libtiledb_ctx
XPtr<tiledb::Context> libtiledb_ctx(Nullable<XPtr<tiledb::Config>> config);
RcppExport SEXP _tiledb_libtiledb_ctx(SEXP configSEXP) {
Expand Down Expand Up @@ -3589,6 +3600,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_tiledb_tiledb_datatype_string_to_sizeof", (DL_FUNC) &_tiledb_tiledb_datatype_string_to_sizeof, 1},
{"_tiledb_tiledb_datatype_R_type", (DL_FUNC) &_tiledb_tiledb_datatype_R_type, 1},
{"_tiledb_libtiledb_version", (DL_FUNC) &_tiledb_libtiledb_version, 0},
{"_tiledb_tiledb_datatype_max_value", (DL_FUNC) &_tiledb_tiledb_datatype_max_value, 1},
{"_tiledb_libtiledb_ctx", (DL_FUNC) &_tiledb_libtiledb_ctx, 1},
{"_tiledb_libtiledb_ctx_config", (DL_FUNC) &_tiledb_libtiledb_ctx_config, 1},
{"_tiledb_libtiledb_ctx_is_supported_fs", (DL_FUNC) &_tiledb_libtiledb_ctx_is_supported_fs, 2},
Expand Down
17 changes: 16 additions & 1 deletion src/libtiledb.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2017-2023 TileDB Inc.
// Copyright (c) 2017-2024 TileDB Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -491,6 +491,21 @@ NumericVector libtiledb_version() {
Named("patch") = std::get<2>(ver));
}

// [[Rcpp::export]]
size_t tiledb_datatype_max_value(const std::string& datatype) {
tiledb_datatype_t dtype = _string_to_tiledb_datatype(datatype);
switch (dtype) {
case TILEDB_INT8: return std::numeric_limits<int8_t>::max();
case TILEDB_UINT8: return std::numeric_limits<uint8_t>::max();
case TILEDB_INT16: return std::numeric_limits<int16_t>::max();
case TILEDB_UINT16: return std::numeric_limits<uint16_t>::max();
case TILEDB_INT32: return std::numeric_limits<int32_t>::max();
case TILEDB_UINT32: return std::numeric_limits<uint32_t>::max();
case TILEDB_INT64: return std::numeric_limits<int64_t>::max();
case TILEDB_UINT64: return std::numeric_limits<uint64_t>::max();
default: Rcpp::stop("currently unsupported datatype (%s)", datatype);
}
}

/**
* TileDB Context
Expand Down
Loading