Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++/python] Fixes for dense arrays and core 2.27/dev #3244

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 47 additions & 12 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,29 @@ def create(

index_column_schema = []
index_column_data = {}

for dim_idx, dim_shape in enumerate(shape):
dim_name = f"soma_dim_{dim_idx}"

pa_field = pa.field(dim_name, pa.int64())
index_column_schema.append(pa_field)

# Here is our Arrow data API for communicating schema info between
# Python/R and C++ libtiledbsoma:
#
# [0] core max domain lo
# [1] core max domain hi
# [2] core extent parameter
# If present, these next two signal to use the current-domain feature:
# [3] core current domain lo
# [4] core current domain hi

if NEW_SHAPE_FEATURE_FLAG_ENABLED and clib.embedded_version_triple() >= (
2,
27,
0,
):

dim_capacity, dim_extent = cls._dim_capacity_and_extent(
dim_name,
# The user specifies current domain -- this is the max domain
Expand Down Expand Up @@ -142,8 +156,7 @@ def create(
TileDBCreateOptions.from_platform_config(platform_config),
)

index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent]
index_column_schema.append(pa_field)
index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent]

index_column_info = pa.RecordBatch.from_pydict(
index_column_data, schema=pa.schema(index_column_schema)
Expand Down Expand Up @@ -250,6 +263,19 @@ def read(
timestamp=handle.timestamp and (0, handle.timestamp),
)

# Scenario to avoid:
# * Query dense array with coords not provided
# * When coords not provided, core uses the core domain
# For old shape:
# * Core current domain did not exist
# * Core max domain may be small; .shape returns this
# For new shape (core 2.27 and tiledbsoma 1.15):
# * Core current domain exists and will be small; .shape returns this
# * Core max domain will be huge
# In either case, applying these coords is the right thing to do
if coords == ():
coords = tuple(slice(0, e - 1) for e in data_shape)

self._set_reader_coords(sr, coords)

arrow_tables = []
Expand Down Expand Up @@ -353,16 +379,25 @@ def _dim_capacity_and_extent(
dim_shape: Optional[int],
create_options: TileDBCreateOptions,
) -> Tuple[int, int]:
"""Given a user-specified shape along a particular dimension, returns a tuple of
the TileDB capacity and extent for that dimension, suitable for schema creation.
The user-specified shape cannot be ``None`` for :class:`DenseNDArray`.
"""Given a user-specified shape (maybe ``None``) along a particular dimension,
returns a tuple of the TileDB capacity and extent for that dimension, suitable
for schema creation. If the user-specified shape is None, the largest possible
int64 is returned for the capacity -- which is particularly suitable for
maxdomain.
"""
if dim_shape is None or dim_shape <= 0:
raise ValueError(
"SOMADenseNDArray shape must be a non-zero-length tuple of positive ints"
)

dim_capacity = dim_shape
dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048))
if dim_shape is None:
dim_capacity = 2**63 - 1
dim_extent = min(dim_capacity, create_options.dim_tile(dim_name, 2048))
# For core: "domain max expanded to multiple of tile extent exceeds max value
# representable by domain type. Reduce domain max by 1 tile extent to allow for
# expansion."
dim_capacity -= dim_extent
else:
if dim_shape <= 0:
raise ValueError(
"SOMADenseNDArray shape must be a non-zero-length tuple of positive ints or Nones"
)
dim_capacity = dim_shape
dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048))

return (dim_capacity, dim_extent)
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ def create(
dim_name = f"soma_dim_{dim_idx}"

pa_field = pa.field(dim_name, pa.int64())

index_column_schema.append(pa_field)

# Here is our Arrow data API for communicating schema info between
Expand Down Expand Up @@ -504,7 +503,8 @@ def _dim_capacity_and_extent(
"""Given a user-specified shape (maybe ``None``) along a particular dimension,
returns a tuple of the TileDB capacity and extent for that dimension, suitable
for schema creation. If the user-specified shape is None, the largest possible
int64 is returned for the capacity.
int64 is returned for the capacity -- which is particularly suitable for
maxdomain.
"""
if dim_shape is None:
dim_capacity = 2**63 - 1
Expand Down
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1909,7 +1909,7 @@ def _write_matrix_to_denseNDArray(
else:
tensor = pa.Tensor.from_numpy(chunk.toarray())
if matrix.ndim == 2:
soma_ndarray.write((slice(i, i2), slice(None)), tensor)
soma_ndarray.write((slice(i, i2), slice(0, ncol)), tensor)
else:
soma_ndarray.write((slice(i, i2),), tensor)

Expand Down
10 changes: 8 additions & 2 deletions apis/python/tests/test_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest

import tiledbsoma as soma
import tiledbsoma.pytiledbsoma as clib
from tiledbsoma.options import SOMATileDBContext

from . import NDARRAY_ARROW_TYPES_NOT_SUPPORTED, NDARRAY_ARROW_TYPES_SUPPORTED
Expand Down Expand Up @@ -180,8 +181,13 @@ def test_dense_nd_array_requires_shape(tmp_path, shape_is_numeric):
with soma.DenseNDArray.open(uri) as dnda:
assert dnda.shape == (2, 3)
else:
with pytest.raises(ValueError):
soma.DenseNDArray.create(uri, type=pa.float32(), shape=(None, None)).close()
soma.DenseNDArray.create(uri, type=pa.float32(), shape=(None, None)).close()
with soma.DenseNDArray.open(uri) as dnda:
if (
soma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED
and clib.embedded_version_triple() >= (2, 27, 0)
):
assert dnda.shape == (1, 1)


def test_dense_nd_array_ned_write(tmp_path):
Expand Down
4 changes: 2 additions & 2 deletions apis/r/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ createSchemaFromArrow <- function(uri, nasp, nadimap, nadimsp, sparse, datatype,
invisible(.Call(`_tiledbsoma_createSchemaFromArrow`, uri, nasp, nadimap, nadimsp, sparse, datatype, pclst, ctxxp, tsvec))
}

writeArrayFromArrow <- function(uri, naap, nasp, ctxxp, arraytype = "", config = NULL, tsvec = NULL) {
invisible(.Call(`_tiledbsoma_writeArrayFromArrow`, uri, naap, nasp, ctxxp, arraytype, config, tsvec))
writeArrayFromArrow <- function(uri, naap, nasp, coords_list, ctxxp, arraytype = "", config = NULL, tsvec = NULL) {
invisible(.Call(`_tiledbsoma_writeArrayFromArrow`, uri, naap, nasp, coords_list, ctxxp, arraytype, config, tsvec))
}

#' @noRd
Expand Down
1 change: 1 addition & 0 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ SOMADataFrame <- R6::R6Class(
uri = self$uri,
naap = naap,
nasp = nasp,
coords_list = list(), # only used for SOMADenseNDArray
ctxxp = private$.soma_context,
arraytype = "SOMADataFrame",
config = NULL,
Expand Down
1 change: 1 addition & 0 deletions apis/r/R/SOMADenseNDArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ SOMADenseNDArray <- R6::R6Class(
uri = self$uri,
naap = naap,
nasp = nasp,
coords_list = coords,
ctxxp = private$.soma_context,
arraytype = "SOMADenseNDArray",
config = NULL,
Expand Down
1 change: 1 addition & 0 deletions apis/r/R/SOMASparseNDArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ SOMASparseNDArray <- R6::R6Class(
uri = self$uri,
naap = naap,
nasp = nasp,
coords_list = list(), # only used for SOMADenseNDArray
ctxxp = private$.soma_context,
arraytype = "SOMASparseNDArray",
config = NULL,
Expand Down
9 changes: 5 additions & 4 deletions apis/r/src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,19 @@ BEGIN_RCPP
END_RCPP
}
// writeArrayFromArrow
void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp, Rcpp::XPtr<somactx_wrap_t> ctxxp, const std::string arraytype, Rcpp::Nullable<Rcpp::CharacterVector> config, Rcpp::Nullable<Rcpp::DatetimeVector> tsvec);
RcppExport SEXP _tiledbsoma_writeArrayFromArrow(SEXP uriSEXP, SEXP naapSEXP, SEXP naspSEXP, SEXP ctxxpSEXP, SEXP arraytypeSEXP, SEXP configSEXP, SEXP tsvecSEXP) {
void writeArrayFromArrow(const std::string& uri, naxpArray naap, naxpSchema nasp, Rcpp::List coords_list, Rcpp::XPtr<somactx_wrap_t> ctxxp, const std::string arraytype, Rcpp::Nullable<Rcpp::CharacterVector> config, Rcpp::Nullable<Rcpp::DatetimeVector> tsvec);
RcppExport SEXP _tiledbsoma_writeArrayFromArrow(SEXP uriSEXP, SEXP naapSEXP, SEXP naspSEXP, SEXP coords_listSEXP, SEXP ctxxpSEXP, SEXP arraytypeSEXP, SEXP configSEXP, SEXP tsvecSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP);
Rcpp::traits::input_parameter< naxpArray >::type naap(naapSEXP);
Rcpp::traits::input_parameter< naxpSchema >::type nasp(naspSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type coords_list(coords_listSEXP);
Rcpp::traits::input_parameter< Rcpp::XPtr<somactx_wrap_t> >::type ctxxp(ctxxpSEXP);
Rcpp::traits::input_parameter< const std::string >::type arraytype(arraytypeSEXP);
Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::CharacterVector> >::type config(configSEXP);
Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::DatetimeVector> >::type tsvec(tsvecSEXP);
writeArrayFromArrow(uri, naap, nasp, ctxxp, arraytype, config, tsvec);
writeArrayFromArrow(uri, naap, nasp, coords_list, ctxxp, arraytype, config, tsvec);
return R_NilValue;
END_RCPP
}
Expand Down Expand Up @@ -793,7 +794,7 @@ END_RCPP
static const R_CallMethodDef CallEntries[] = {
{"_tiledbsoma_createSOMAContext", (DL_FUNC) &_tiledbsoma_createSOMAContext, 1},
{"_tiledbsoma_createSchemaFromArrow", (DL_FUNC) &_tiledbsoma_createSchemaFromArrow, 9},
{"_tiledbsoma_writeArrayFromArrow", (DL_FUNC) &_tiledbsoma_writeArrayFromArrow, 7},
{"_tiledbsoma_writeArrayFromArrow", (DL_FUNC) &_tiledbsoma_writeArrayFromArrow, 8},
{"_tiledbsoma_c_group_create", (DL_FUNC) &_tiledbsoma_c_group_create, 4},
{"_tiledbsoma_c_group_open", (DL_FUNC) &_tiledbsoma_c_group_open, 4},
{"_tiledbsoma_c_group_member_count", (DL_FUNC) &_tiledbsoma_c_group_member_count, 1},
Expand Down
71 changes: 53 additions & 18 deletions apis/r/src/arrow.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include <Rcpp/Lighter> // for R interface to C++
#include <Rcpp/Lighter> // for R interface to C++

#include <nanoarrow/r.h> // for C/C++ interface to Arrow (via header exported from the R package)
#include <RcppInt64> // for fromInteger64
Expand Down Expand Up @@ -175,6 +175,7 @@ void writeArrayFromArrow(
const std::string& uri,
naxpArray naap,
naxpSchema nasp,
Rcpp::List coords_list,
Rcpp::XPtr<somactx_wrap_t> ctxxp,
const std::string arraytype = "",
Rcpp::Nullable<Rcpp::CharacterVector> config = R_NilValue,
Expand Down Expand Up @@ -202,23 +203,6 @@ void writeArrayFromArrow(
// shared pointer to TileDB Context from SOMAContext -- not needed here
// std::shared_ptr<tiledb::Context> ctx = sctx->tiledb_ctx();

// // if we hae a coonfig, use it
// std::shared_ptr<tdbs::SOMAContext> somactx;
// if (config.isNotNull()) {
// std::map<std::string, std::string> smap;
// auto config_vec = config.as();
// auto config_names =
// Rcpp::as<Rcpp::CharacterVector>(config_vec.names()); for (auto &name
// : config_names) {
// std::string param = Rcpp::as<std::string>(name);
// std::string value = Rcpp::as<std::string>(config_vec[param]);
// smap[param] = value;
// }
// somactx = std::make_shared<tdbs::SOMAContext>(smap);
// } else {
// somactx = std::make_shared<tdbs::SOMAContext>();
// }

// optional timestamp range
std::optional<tdbs::TimestampRange> tsrng = makeTimestampRange(tsvec);

Expand Down Expand Up @@ -258,6 +242,57 @@ void writeArrayFromArrow(
}

arrup.get()->set_array_data(std::move(schema), std::move(array));

// For dense arrays, we need to specify the subrange for the write.
// If we don't, the core domain is used.
//
// * With the new shape feature (core 2.27, tiledbsoma 1.15) the
// core domain (soma maxdomain) is huge while the core current domain
// (soma domain) is small.
//
// * It's important to be able to write subarrays. E.g. a dense 2D
// array is 1,000,000 x 60,000 but we want to write the first 3000
// rows.

if (arraytype == "SOMADenseNDArray") {
auto dim_names = arrup->dimension_names();

if (dim_names.size() != coords_list.length()) {
// This is internal error not used error since this should have been
// a stopifnot at the R level, caught already before we got here.
Rcpp::stop(tfm::format(
"dense array write: internal error: ndim %d != ncoord %d",
dim_names.size(),
coords_list.length()));
}

for (int i = 0; i < dim_names.size(); i++) {
auto dim_name = dim_names[i];
std::vector<int> slot_values = Rcpp::as<std::vector<int>>(
coords_list[i]);
int lo = *std::min_element(slot_values.begin(), slot_values.end());
int hi = *std::max_element(slot_values.begin(), slot_values.end());
spdl::debug(
"dense array write: dim {} set 1-up range lo {} hi {}",
dim_name,
lo,
hi);
// These are 1-up indices from R. Convert to 0-up for C++.
if (lo < 1) {
Rcpp::stop(tfm::format(
"dense array write: expected lower bound %d >= 1 for dim "
"name %s",
lo,
dim_name));
}
lo--;
hi--;
std::pair<int64_t, int64_t> lo_hi(int64_t{lo}, int64_t{hi});
std::vector<std::pair<int64_t, int64_t>> range({lo_hi});
arrup.get()->set_dim_ranges(dim_name, range);
}
}

arrup.get()->write();
arrup.get()->close();
}
42 changes: 40 additions & 2 deletions libtiledbsoma/src/soma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,46 @@ void ManagedQuery::setup_read() {
ctx_->ptr().get(), array_->ptr().get(), 0, &ned, &is_empty));

std::pair<int64_t, int64_t> array_shape;
if (is_empty == 1) {
array_shape = schema.domain().dimension(0).domain<int64_t>();
if (is_empty) {
// Before the new-shape feature (core 2.26 for sparse, core 2.27
// for dense, and tiledbsoma 1.15):
//
// * Core current domain did not exist
//
// * Core domain was small for tiledbsoma DenseNDArray --
// dimensioned
// like the experiment's obs/var counts
//
// With the new-shape feature (core 2.26, and tiledbsoma 1.15):
//
// * Core current domain exists and is small -- dimensioned
// like the experiment's obs/var counts
//
// * Core domain is huge -- a bit shy of 2**63
//
// We need to find out which one is the case.
//
// Note that SOMAArray has a one-line accessor for this
// but we are not operating within that context.

auto current_domain =
tiledb::ArraySchemaExperimental::current_domain(
*ctx_, array_->schema());
auto dim0 = schema.domain().dimension(0);
if (current_domain.is_empty()) {
// Core current domain does not exist; use core domain.
array_shape = dim0.domain<int64_t>();
} else {
// Core current domain exists; use it.
if (current_domain.type() != TILEDB_NDRECTANGLE) {
throw TileDBSOMAError(
"found non-rectangle current-domain type");
}
NDRectangle ndrect = current_domain.ndrectangle();
std::array<int64_t, 2> arr = ndrect.range<int64_t>(
dim0.name());
array_shape = std::pair<int64_t, int64_t>(arr[0], arr[1]);
}
} else {
array_shape = std::make_pair(ned[0], ned[1]);
}
Expand Down
Loading