diff --git a/tests/test_tensorflow_data_api.py b/tests/test_tensorflow_data_api.py index 66c1ab44..80067df5 100644 --- a/tests/test_tensorflow_data_api.py +++ b/tests/test_tensorflow_data_api.py @@ -8,7 +8,7 @@ import tensorflow as tf import tiledb -from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset +from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset, _generator from .utils import ingest_in_tiledb @@ -255,28 +255,45 @@ def test_dataset_generator_batch_output( attribute_names = [ "features_" + str(attr) for attr in range(num_of_attributes) ] - dataset = TensorflowTileDBDataset( + kwargs = dict( x_array=x, y_array=y, batch_size=BATCH_SIZE, batch_shuffle=batch_shuffle, - buffer_size=buffer_size, within_batch_shuffle=within_batch_shuffle, - x_attribute_names=attribute_names, - y_attribute_names=attribute_names, ) - generated_data = next(iter(dataset)) - assert len(generated_data) == 2 * num_of_attributes - - for attr in range(num_of_attributes): - assert tuple(generated_data[attr].shape) <= ( - BATCH_SIZE, - *input_shape[1:], - ) - assert tuple(generated_data[num_of_attributes + attr].shape) <= ( - BATCH_SIZE, - NUM_OF_CLASSES, - ) + # Test the generator twice: once with the public API (TensorflowTileDBDataset) + # and once by calling _generator directly.
Although the former calls the + # latter internally, _generator is not reported as covered by the coverage + # report due to https://github.com/tensorflow/tensorflow/issues/33759 + generators = [ + iter( + TensorflowTileDBDataset( + x_attribute_names=attribute_names, + y_attribute_names=attribute_names, + buffer_size=buffer_size, + **kwargs + ) + ), + _generator( + x_attrs=attribute_names, + y_attrs=attribute_names, + buffer_size=buffer_size or BATCH_SIZE, + **kwargs + ), + ] + for generator in generators: + generated_data = next(generator) + assert len(generated_data) == 2 * num_of_attributes + for attr in range(num_of_attributes): + assert tuple(generated_data[attr].shape) <= ( + BATCH_SIZE, + *input_shape[1:], + ) + assert tuple(generated_data[num_of_attributes + attr].shape) <= ( + BATCH_SIZE, + NUM_OF_CLASSES, + ) def test_buffer_size_geq_batch_size_exception( self, diff --git a/tests/test_tensorflow_sparse_data_api.py b/tests/test_tensorflow_sparse_data_api.py index 29aff997..76d3e41b 100644 --- a/tests/test_tensorflow_sparse_data_api.py +++ b/tests/test_tensorflow_sparse_data_api.py @@ -8,7 +8,7 @@ import tensorflow as tf import tiledb -from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset +from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset, _generator from .utils import create_sparse_array_one_hot_2d, ingest_in_tiledb @@ -380,28 +380,48 @@ def test_generator_sparse_x_dense_y_batch_output( attribute_names = [ "features_" + str(attr) for attr in range(num_of_attributes) ] - dataset = TensorflowTileDBDataset( + kwargs = dict( x_array=x, y_array=y, batch_size=BATCH_SIZE, - buffer_size=buffer_size, batch_shuffle=batch_shuffle, - x_attribute_names=attribute_names, - y_attribute_names=attribute_names, ) - generated_data = next(iter(dataset)) - assert len(generated_data) == 2 * num_of_attributes - - for attr in range(num_of_attributes): - assert isinstance(generated_data[attr], tf.SparseTensor) - assert isinstance(generated_data[attr + 
num_of_attributes], tf.Tensor) - - # Coords should be equal to batch for both x and y - assert generated_data[attr].indices.shape[0] <= BATCH_SIZE - assert tuple(generated_data[attr + num_of_attributes].shape) <= ( - BATCH_SIZE, - NUM_OF_CLASSES, - ) + # Test the generator twice: once with the public API (TensorflowTileDBDataset) + # and once by calling _generator directly. Although the former calls the + # latter internally, _generator is not reported as covered by the coverage + # report due to https://github.com/tensorflow/tensorflow/issues/33759 + generators = [ + iter( + TensorflowTileDBDataset( + x_attribute_names=attribute_names, + y_attribute_names=attribute_names, + buffer_size=buffer_size, + **kwargs + ) + ), + _generator( + x_attrs=attribute_names, + y_attrs=attribute_names, + buffer_size=buffer_size or BATCH_SIZE, + **kwargs + ), + ] + for generator in generators: + generated_data = next(generator) + assert len(generated_data) == 2 * num_of_attributes + + for attr in range(num_of_attributes): + assert isinstance(generated_data[attr], tf.SparseTensor) + assert isinstance( + generated_data[attr + num_of_attributes], tf.Tensor + ) + + # Coords should be equal to batch for both x and y + assert generated_data[attr].indices.shape[0] <= BATCH_SIZE + assert tuple(generated_data[attr + num_of_attributes].shape) <= ( + BATCH_SIZE, + NUM_OF_CLASSES, + ) def test_generator_sparse_x_sparse_y_batch_output( self, tmpdir, input_shape, num_of_attributes, batch_shuffle, buffer_size @@ -429,29 +449,48 @@ def test_generator_sparse_x_sparse_y_batch_output( attribute_names = [ "features_" + str(attr) for attr in range(num_of_attributes) ] - - dataset = TensorflowTileDBDataset( + kwargs = dict( x_array=x, y_array=y, batch_size=BATCH_SIZE, - buffer_size=buffer_size, batch_shuffle=batch_shuffle, - x_attribute_names=attribute_names, - y_attribute_names=attribute_names, ) - generated_data = next(iter(dataset)) - assert len(generated_data) == 2 * num_of_attributes - - for attr in 
range(num_of_attributes): - assert isinstance(generated_data[attr], tf.SparseTensor) - assert isinstance( - generated_data[attr + num_of_attributes], tf.SparseTensor - ) - - # Coords should be equal to batch for both x and y - assert generated_data[attr].indices.shape[0] <= BATCH_SIZE - - assert tuple(generated_data[attr + num_of_attributes].shape.dims) <= ( - BATCH_SIZE, - NUM_OF_CLASSES, - ) + # Test the generator twice: once with the public API (TensorflowTileDBDataset) + # and once by calling _generator directly. Although the former calls the + # latter internally, _generator is not reported as covered by the coverage + # report due to https://github.com/tensorflow/tensorflow/issues/33759 + generators = [ + iter( + TensorflowTileDBDataset( + x_attribute_names=attribute_names, + y_attribute_names=attribute_names, + buffer_size=buffer_size, + **kwargs + ) + ), + _generator( + x_attrs=attribute_names, + y_attrs=attribute_names, + buffer_size=buffer_size or BATCH_SIZE, + **kwargs + ), + ] + for generator in generators: + generated_data = next(generator) + assert len(generated_data) == 2 * num_of_attributes + + for attr in range(num_of_attributes): + assert isinstance(generated_data[attr], tf.SparseTensor) + assert isinstance( + generated_data[attr + num_of_attributes], tf.SparseTensor + ) + + # Coords should be equal to batch for both x and y + assert generated_data[attr].indices.shape[0] <= BATCH_SIZE + + assert tuple( + generated_data[attr + num_of_attributes].shape.dims + ) <= ( + BATCH_SIZE, + NUM_OF_CLASSES, + ) diff --git a/tiledb/ml/readers/tensorflow.py b/tiledb/ml/readers/tensorflow.py index 9d060414..163bb427 100644 --- a/tiledb/ml/readers/tensorflow.py +++ b/tiledb/ml/readers/tensorflow.py @@ -42,8 +42,8 @@ def TensorflowTileDBDataset( raise TypeError("Dense x_array and sparse y_array not currently supported") # Check that x_array and y_array have the same number of rows - rows: int = x_array.schema.domain.shape[0] - if rows != y_array.schema.domain.shape[0]: 
+ rows: int = x_array.shape[0] + if rows != y_array.shape[0]: raise ValueError( "x_array and y_array should have the same number of rows, i.e. the " "first dimension of x_array and y_array should be of equal domain extent" @@ -70,7 +70,6 @@ def TensorflowTileDBDataset( y_array=y_array, x_attrs=x_attribute_names, y_attrs=y_attribute_names, - offsets=range(0, rows, batch_size), batch_size=batch_size, buffer_size=buffer_size, batch_shuffle=batch_shuffle, @@ -83,7 +82,7 @@ def TensorflowTileDBDataset( def _get_attr_names(array: tiledb.Array) -> Sequence[str]: - return tuple(array.schema.attr(idx).name for idx in range(array.schema.nattr)) + return tuple(array.attr(idx).name for idx in range(array.schema.nattr)) def _get_signature( @@ -93,10 +92,7 @@ def _get_signature( tf.SparseTensorSpec if isinstance(array, tiledb.SparseArray) else tf.TensorSpec ) return tuple( - cls( - shape=(None, *array.schema.domain.shape[1:]), - dtype=array.schema.attr(attr).dtype, - ) + cls(shape=(None, *array.shape[1:]), dtype=array.attr(attr).dtype) for attr in attrs ) @@ -106,7 +102,6 @@ def _generator( y_array: tiledb.Array, x_attrs: Sequence[str], y_attrs: Sequence[str], - offsets: range, batch_size: int, buffer_size: int, batch_shuffle: bool = False, @@ -115,7 +110,7 @@ def _generator( x_batch = TensorflowBatch(x_array.schema, x_attrs, batch_size) y_batch = TensorflowBatch(y_array.schema, y_attrs, batch_size) with ThreadPoolExecutor(max_workers=2) as executor: - for offset in offsets: + for offset in range(0, x_array.shape[0], buffer_size): x_buffer, y_buffer = executor.map( lambda array: array[offset : offset + buffer_size], # type: ignore (x_array, y_array),