From 24a74384964c1f7a62938dacb3a644f7488bba6f Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 28 Oct 2024 13:53:05 -0400 Subject: [PATCH] [python] Bindings for `upgrade_domain` (#3235) * [python/r] Bindings for `upgrade_domain` * unit-test coverage * update checking for none-slots for string index columns * code-review feedback Co-authored-by: nguyenv --------- Co-authored-by: nguyenv --- apis/python/src/tiledbsoma/_dataframe.py | 109 ++++++++++++++++- apis/python/src/tiledbsoma/_tdb_handles.py | 50 ++++++++ apis/python/src/tiledbsoma/soma_dataframe.cc | 104 ++++++++++++++++ apis/python/tests/test_shape.py | 119 +++++++++++++++++++ libtiledbsoma/src/soma/soma_array.h | 69 ++++++++--- libtiledbsoma/test/unit_soma_dataframe.cc | 24 ++-- 6 files changed, 441 insertions(+), 34 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index fb59049922..8831dc4ae2 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -7,7 +7,17 @@ Implementation of a SOMA DataFrame """ import inspect -from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast +from typing import ( + Any, + Dict, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) import numpy as np import pyarrow as pa @@ -484,6 +494,103 @@ def tiledbsoma_upgrade_soma_joinid_shape( ) return (True, "") + def _upgrade_or_change_domain_helper( + self, newdomain: Domain, function_name_for_messages: str + ) -> Any: + """Converts the user-level tuple of low/high pairs into a pyarrow table suitable for calling libtiledbsoma.""" + + # Check user-provided domain against dataframe domain. + dim_names = self._tiledb_dim_names() + if len(dim_names) != len(newdomain): + raise ValueError( + f"{function_name_for_messages}: requested domain has length {len(dim_names)} but the dataframe's schema has index-column count {len(newdomain)}" + ) + + if any([slot is not None and len(slot) != 2 for slot in newdomain]): + raise ValueError( + f"{function_name_for_messages}: requested domain must have low,high pairs, or `None`, in each slot" + ) + + # From the dataframe's schema, extract the subschema for only index columns (TileDB dimensions). + full_schema = self.schema + dim_schema_list = [] + for dim_name in dim_names: + dim_schema_list.append(full_schema.field(dim_name)) + dim_schema = pa.schema(dim_schema_list) + + # Convert the user's tuple of low/high pairs into a dict keyed by index-column name. + new_domain_dict: Dict[str, Domain] = {} + for dim_name, new_dom in zip(dim_names, newdomain): + # Domain can't be specified for strings (core constraint) so let them keystroke that easily. + if ( + dim_schema.field(dim_name).type + in [ + pa.string(), + pa.large_string(), + pa.binary(), + pa.large_binary(), + ] + and new_dom is None + ): + new_domain_dict[dim_name] = ("", "") # type: ignore + else: + new_domain_dict[dim_name] = tuple(new_dom) # type: ignore + + # Return this as a pyarrow table. This has n columns where n is the number of + # index columns, and two rows: one row for the low values and one for the high values. + return pa.RecordBatch.from_pydict(new_domain_dict, schema=dim_schema) + + def tiledbsoma_upgrade_domain( + self, newdomain: Domain, check_only: bool = False + ) -> StatusAndReason: + """Allows you to set the domain of a SOMA :class:`DataFrame``, when the + ``DataFrame`` does not have a domain set yet. The argument must be a + tuple of pairs of low/high values for the desired domain, one pair per + index column. For string index columns, you must offer the low/high pair + as `("", "")`. If ``check_only`` is ``True``, returns whether the + operation would succeed if attempted, and a reason why it would not. + """ + pyarrow_domain_table = self._upgrade_or_change_domain_helper( + newdomain, "tiledbsoma_upgrade_domain" + ) + + if check_only: + return cast( + StatusAndReason, + self._handle._handle.can_upgrade_domain( + pyarrow_domain_table, "tiledbsoma_upgrade_domain" + ), + ) + else: + self._handle._handle.upgrade_domain( + pyarrow_domain_table, "tiledbsoma_upgrade_domain" + ) + return (True, "") + + def change_domain( + self, newdomain: Domain, check_only: bool = False + ) -> StatusAndReason: + """Allows you to enlarge the domain of a SOMA :class:`DataFrame``, when + the ``DataFrame`` already has a domain. The argument must be a tuple of + pairs of low/high values for the desired domain, one pair per index + column. For string index columns, you must offer the low/high pair as + `("", "")`. If ``check_only`` is ``True``, returns whether the + operation would succeed if attempted, and a reason why it would not. + """ + pyarrow_domain_table = self._upgrade_or_change_domain_helper( + newdomain, "change_domain" + ) + if check_only: + return cast( + StatusAndReason, + self._handle._handle.can_change_domain( + pyarrow_domain_table, "change_domain" + ), + ) + else: + self._handle._handle.change_domain(pyarrow_domain_table, "change_domain") + return (True, "") + def __len__(self) -> int: """Returns the number of rows in the dataframe. Same as ``df.count``.""" return self.count diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index aadbc11ef2..80bda19881 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -505,6 +505,28 @@ def can_upgrade_soma_joinid_shape( """Only implemented for DataFrame.""" raise NotImplementedError + def upgrade_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> None: + """Only implemented for DataFrame.""" + raise NotImplementedError + + def can_upgrade_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> StatusAndReason: + """Only implemented for DataFrame.""" + raise NotImplementedError + + def change_domain(self, newdomain: Domain, function_name_for_messages: str) -> None: + """Only implemented for DataFrame.""" + raise NotImplementedError + + def can_change_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> StatusAndReason: + """Only implemented for DataFrame.""" + raise NotImplementedError + class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): """Wrapper around a Pybind11 SOMADataFrame handle.""" @@ -567,6 +589,34 @@ def can_upgrade_soma_joinid_shape( ), ) + def upgrade_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> None: + """Wrapper-class internals""" + self._handle.upgrade_domain(newdomain, function_name_for_messages) + + def can_upgrade_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> StatusAndReason: + """Wrapper-class internals""" + return cast( + StatusAndReason, + self._handle.can_upgrade_domain(newdomain, function_name_for_messages), + ) + + def change_domain(self, newdomain: Domain, function_name_for_messages: str) -> None: + """Wrapper-class internals""" + self._handle.change_domain(newdomain, function_name_for_messages) + + def can_change_domain( + self, newdomain: Domain, function_name_for_messages: str + ) -> StatusAndReason: + """Wrapper-class internals""" + return cast( + StatusAndReason, + self._handle.can_change_domain(newdomain, function_name_for_messages), + ) + class PointCloudDataFrameWrapper(SOMAArrayWrapper[clib.SOMAPointCloudDataFrame]): """Wrapper around a Pybind11 SOMAPointCloudDataFrame handle.""" diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index c7075ce564..d55a2ad3d2 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -216,6 +216,110 @@ void load_soma_dataframe(py::module& m) { } }, "newshape"_a, + "function_name_for_messages"_a) + + .def( + "upgrade_domain", + [](SOMADataFrame& sdf, + py::object pyarrow_domain_table, + std::string function_name_for_messages) { + ArrowArray pyarrow_domain_array; + ArrowSchema pyarrow_domain_schema; + uintptr_t nanoarrow_domain_array_ptr = + (uintptr_t)(&pyarrow_domain_array); + uintptr_t nanoarrow_domain_schema_ptr = + (uintptr_t)(&pyarrow_domain_schema); + pyarrow_domain_table.attr("_export_to_c")( + nanoarrow_domain_array_ptr, nanoarrow_domain_schema_ptr); + ArrowTable nanoarrow_domain_table( + std::make_unique(pyarrow_domain_array), + std::make_unique(pyarrow_domain_schema)); + try { + sdf.upgrade_domain( + nanoarrow_domain_table, function_name_for_messages); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }, + "pyarrow_domain_table"_a, + "function_name_for_messages"_a) + + .def( + "can_upgrade_domain", + [](SOMADataFrame& sdf, + py::object pyarrow_domain_table, + std::string function_name_for_messages) { + ArrowArray pyarrow_domain_array; + ArrowSchema pyarrow_domain_schema; + uintptr_t nanoarrow_domain_array_ptr = + (uintptr_t)(&pyarrow_domain_array); + uintptr_t nanoarrow_domain_schema_ptr = + (uintptr_t)(&pyarrow_domain_schema); + pyarrow_domain_table.attr("_export_to_c")( + nanoarrow_domain_array_ptr, nanoarrow_domain_schema_ptr); + ArrowTable nanoarrow_domain_table( + std::make_unique(pyarrow_domain_array), + std::make_unique(pyarrow_domain_schema)); + try { + return sdf.can_upgrade_domain( + nanoarrow_domain_table, function_name_for_messages); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }, + "pyarrow_domain_table"_a, + "function_name_for_messages"_a) + + .def( + "change_domain", + [](SOMADataFrame& sdf, + py::object pyarrow_domain_table, + std::string function_name_for_messages) { + ArrowArray pyarrow_domain_array; + ArrowSchema pyarrow_domain_schema; + uintptr_t nanoarrow_domain_array_ptr = + (uintptr_t)(&pyarrow_domain_array); + uintptr_t nanoarrow_domain_schema_ptr = + (uintptr_t)(&pyarrow_domain_schema); + pyarrow_domain_table.attr("_export_to_c")( + nanoarrow_domain_array_ptr, nanoarrow_domain_schema_ptr); + ArrowTable nanoarrow_domain_table( + std::make_unique(pyarrow_domain_array), + std::make_unique(pyarrow_domain_schema)); + try { + sdf.change_domain( + nanoarrow_domain_table, function_name_for_messages); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }, + "pyarrow_domain_table"_a, + "function_name_for_messages"_a) + + .def( + "can_change_domain", + [](SOMADataFrame& sdf, + py::object pyarrow_domain_table, + std::string function_name_for_messages) { + ArrowArray pyarrow_domain_array; + ArrowSchema pyarrow_domain_schema; + uintptr_t nanoarrow_domain_array_ptr = + (uintptr_t)(&pyarrow_domain_array); + uintptr_t nanoarrow_domain_schema_ptr = + (uintptr_t)(&pyarrow_domain_schema); + pyarrow_domain_table.attr("_export_to_c")( + nanoarrow_domain_array_ptr, nanoarrow_domain_schema_ptr); + ArrowTable nanoarrow_domain_table( + std::make_unique(pyarrow_domain_array), + std::make_unique(pyarrow_domain_schema)); + try { + return sdf.can_change_domain( + nanoarrow_domain_table, function_name_for_messages); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + }, + "pyarrow_domain_table"_a, "function_name_for_messages"_a); } diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py index d96bde6ab9..dbfc651bff 100644 --- a/apis/python/tests/test_shape.py +++ b/apis/python/tests/test_shape.py @@ -358,6 +358,103 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names): sdf.write(data) +def test_domain_mods(tmp_path): + if not tiledbsoma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED: + return + + uri = tmp_path.as_posix() + + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("mystring", pa.string()), + ("myint", pa.int16()), + ("myfloat", pa.float32()), + ("mybool", pa.bool_()), # not supported as an index type + ] + ) + index_column_names = ["soma_joinid", "mystring", "myint", "myfloat"] + + domain_for_create = [ + [0, 3], + None, + [20, 50], + [0.0, 6.0], + ] + + data_dict = { + "soma_joinid": [0, 1, 2, 3], + "mystring": ["a", "b", "a", "b"], + "myint": [20, 30, 40, 50], + "myfloat": [1.0, 2.5, 4.0, 5.5], + "mybool": [True, False, True, True], + } + + data = pa.Table.from_pydict(data_dict) + + with tiledbsoma.DataFrame.create( + uri, + schema=schema, + index_column_names=index_column_names, + domain=domain_for_create, + ) as sdf: + sdf.write(data) + + # Check "expand" to same + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [[0, 3], None, [20, 50], [0.0, 6.0]] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert ok + assert msg == "" + + # Shrink + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [[0, 2], None, [20, 50], [0.0, 6.0]] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert not ok + assert "downsize is unsupported" in msg + + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [[0, 3], None, [20, 40], [0.0, 6.0]] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert not ok + assert "downsize is unsupported" in msg + + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [[0, 3], None, [20, 50], [1.0, 6.0]] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert not ok + assert "downsize is unsupported" in msg + + # String domain cannot be specified + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [ + [0, 3], + ["a", "z"], + [20, 50], + [0.0, 6.0], + ] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert not ok + assert "domain cannot be set for string index columns" in msg + + # All clear + with tiledbsoma.DataFrame.open(uri, "w") as sdf: + newdomain = [[0, 9], None, [0, 100], [-10.0, 10.0]] + ok, msg = sdf.change_domain(newdomain, check_only=True) + assert ok + assert msg == "" + sdf.change_domain(newdomain) + + # Check for success + with tiledbsoma.DataFrame.open(uri, "r") as sdf: + dom = sdf.domain + assert dom[0] == (0, 9) + assert dom[1] == ("", "") + assert dom[2] == (0, 100) + assert dom[3] == (-10.0, 10.0) + + @pytest.mark.parametrize("has_shapes", [False, True]) def test_canned_experiments(tmp_path, has_shapes): uri = tmp_path.as_posix() @@ -437,6 +534,28 @@ def _check_ndarray(ndarray, has_shapes, expected_shape): assert "[SparseNDArray] ms/RNA/obsp/distances" in body assert "ms/RNA/obsm/X_draw_graph_fr" in body + # Check upgrade_domain for dataframes + with tiledbsoma.Experiment.open(uri, "w") as exp: + + ok, msg = exp.obs.tiledbsoma_upgrade_domain([[10, 4]], check_only=True) + if has_shapes: + assert not ok + assert "dataframe already has a domain" in msg + else: + assert not ok + assert "new lower > new upper" in msg + + ok, msg = exp.obs.tiledbsoma_upgrade_domain([[0, 1]], check_only=True) + if has_shapes: + assert not ok + assert "dataframe already has a domain" in msg + else: + assert ok + assert msg == "" + + with pytest.raises(ValueError): + exp.obs.tiledbsoma_upgrade_domain([[0, 1, 2]], check_only=True) + # Check dry run of tiledbsoma.io.upgrade_experiment_shapes handle = io.StringIO() upgradeable = tiledbsoma.io.upgrade_experiment_shapes( diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 3bae58a912..77834ffd3b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -1285,21 +1285,27 @@ class SOMAArray : public SOMAObject { } /** - * This is for SOMADataFrame. - * XXX comment more + * This is for SOMADataFrame. While resize_soma_joinid_shape allows the + * user to do up the soma_joinid domain slot, without needing to specify + * the rest (which is the common operation for experiment-level resize) + * this allows the full-generality resize-every-index-column case + * (which only applies to variant-indexed/non-standard dataframes). */ - void upgrade_domain( + void change_domain( const ArrowTable& newdomain, std::string function_name_for_messages) { - _set_domain_helper(newdomain, false, function_name_for_messages); + _set_domain_helper(newdomain, true, function_name_for_messages); } /** - * This is for SOMADataFrame. - * XXX comment more + * This is for SOMADataFrame. While upgrade_soma_joinid_shape allows the + * user to do up the soma_joinid domain slot, without needing to specify + * the rest (which is the common operation for experiment-level resize) + * this allows the full-generality resize-every-index-column case + * (which only applies to variant-indexed/non-standard dataframes). */ - void change_domain( + void upgrade_domain( const ArrowTable& newdomain, std::string function_name_for_messages) { - _set_domain_helper(newdomain, true, function_name_for_messages); + _set_domain_helper(newdomain, false, function_name_for_messages); } protected: @@ -1447,22 +1453,51 @@ class SOMAArray : public SOMAObject { const T& new_lo = new_lo_hi[0]; const T& new_hi = new_lo_hi[1]; - // It's difficult to use fmt::format within a header file since the - // include path to logger.h 'moves around' depending on which source + // If we're checking against the core current domain: the user-provided + // domain must contain the core current domain. + // + // If we're checking against the core (max) domain: the user-provided + // domain must be contained within the core (max) domain. + + // Note: It's difficult to use fmt::format within a header file since + // the include path to logger.h 'moves around' depending on which source // file included us. // // TODO: once we're on C++ 20, just use std::format here and include // things like "old ({}, {}) new ({}, {})". + if (new_lo > new_hi) { - return std::pair(false, "new lower > new upper"); - } - if (new_lo > old_lo) { return std::pair( - false, "new lower > old lower (downsize is unsupported)"); + false, + "index-column name " + dim_name + ": new lower > new upper"); } - if (new_hi < old_hi) { - return std::pair( - false, "new upper < old upper (downsize is unsupported)"); + + if (check_current_domain) { + if (new_lo > old_lo) { + return std::pair( + false, + "index-column name " + dim_name + + ": new lower > old lower (downsize is unsupported)"); + } + if (new_hi < old_hi) { + return std::pair( + false, + "index-column name " + dim_name + + ": new upper < old upper (downsize is unsupported)"); + } + } else { + if (new_lo < old_lo) { + return std::pair( + false, + "index-column name " + dim_name + + ": new lower < limit lower"); + } + if (new_hi > old_hi) { + return std::pair( + false, + "index-column name " + dim_name + + ": new upper > limit upper"); + } } return std::pair(true, ""); } diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 17272aafeb..2d5b27e23a 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -623,6 +623,7 @@ TEST_CASE_METHOD( domain_schema = create_index_cols_info_schema(dim_infos); auto domain_array = ArrowAdapter::make_arrow_array_parent( dim_infos.size()); + // OK since there currently is no shape set: domain_array->children[0] = ArrowAdapter::make_arrow_array_child( std::vector({0, 0})); auto domain_table = ArrowTable( @@ -630,13 +631,8 @@ TEST_CASE_METHOD( if (!use_current_domain) { StatusAndReason check = sdf->can_upgrade_domain( domain_table, "testing"); - REQUIRE(check.first == false); - REQUIRE( - check.second == - "testing for soma_joinid: new upper < old upper " - "(downsize " - "is " - "unsupported)"); + REQUIRE(check.first == true); + REQUIRE(check.second == ""); } else { StatusAndReason check = sdf->can_upgrade_soma_joinid_shape( 1, "testing"); @@ -999,6 +995,7 @@ TEST_CASE_METHOD( domain_schema = create_index_cols_info_schema(dim_infos); auto domain_array = ArrowAdapter::make_arrow_array_parent( dim_infos.size()); + // OK since there currently is no shape set: domain_array->children[0] = ArrowAdapter::make_arrow_array_child( std::vector({0, 0})); domain_array->children[1] = ArrowAdapter::make_arrow_array_child( @@ -1009,12 +1006,8 @@ TEST_CASE_METHOD( if (!use_current_domain) { StatusAndReason check = sdf->can_upgrade_domain( domain_table, "testing"); - REQUIRE(check.first == false); - REQUIRE( - check.second == - "testing for myuint32: new upper < old upper (downsize " - "is " - "unsupported)"); + REQUIRE(check.first == true); + REQUIRE(check.second == ""); } else { StatusAndReason check = sdf->can_upgrade_soma_joinid_shape( 1, "testing"); @@ -1356,9 +1349,8 @@ TEST_CASE_METHOD( REQUIRE(check.first == false); REQUIRE( check.second == - "testing for soma_joinid: new upper < old upper " - "(downsize is " - "unsupported)"); + "testing for mystring: domain cannot be set for string " + "index columns: please use (\"\", \"\")"); } else { StatusAndReason check = sdf->can_upgrade_soma_joinid_shape( 1, "testing");