Commit db10369

Test both the public TensorflowTileDBDataset and the private _generator to work around nedbat/coveragepy#856
gsakkis committed Feb 10, 2022
1 parent 5674e63 commit db10369
Showing 3 changed files with 117 additions and 66 deletions.
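
Background for the workaround: when tf.data.Dataset.from_generator wraps a Python generator, TensorFlow's runtime executes the generator body in a way that coverage.py cannot trace, so code exercised only through the public TensorflowTileDBDataset path shows up as uncovered (see nedbat/coveragepy#856 and tensorflow/tensorflow#33759). The tests below therefore drive both entry points. A minimal, self-contained sketch of the pattern; make_dataset and the simplified _generator here are hypothetical stand-ins, not the library's actual signatures:

import numpy as np
import tensorflow as tf


def _generator(data, batch_size):
    # Private generator: yield one tuple of batched arrays at a time.
    for start in range(0, len(data), batch_size):
        yield (data[start : start + batch_size],)


def make_dataset(data, batch_size):
    # Public factory: wrap the private generator in a tf.data.Dataset.
    return tf.data.Dataset.from_generator(
        lambda: _generator(data, batch_size),
        output_signature=(
            tf.TensorSpec(shape=(None, data.shape[1]), dtype=tf.float64),
        ),
    )


def test_both_paths():
    data = np.random.rand(10, 3)
    # Iterate both the public dataset and the private generator, so the
    # generator body is attributed to the test run even though tf.data
    # executes it outside the traced test thread.
    for generator in (iter(make_dataset(data, 4)), _generator(data, 4)):
        batch = next(generator)
        assert len(batch) == 1
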
51 changes: 34 additions & 17 deletions tests/test_tensorflow_data_api.py
@@ -8,7 +8,7 @@
 import tensorflow as tf
 
 import tiledb
-from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset
+from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset, _generator
 
 from .utils import ingest_in_tiledb
 
@@ -197,28 +197,45 @@ def test_dataset_generator_batch_output(
         attribute_names = [
             "features_" + str(attr) for attr in range(num_of_attributes)
         ]
-        dataset = TensorflowTileDBDataset(
+        kwargs = dict(
             x_array=x,
             y_array=y,
             batch_size=BATCH_SIZE,
             batch_shuffle=batch_shuffle,
-            buffer_size=buffer_size,
             within_batch_shuffle=within_batch_shuffle,
-            x_attribute_names=attribute_names,
-            y_attribute_names=attribute_names,
         )
-        generated_data = next(iter(dataset))
-        assert len(generated_data) == 2 * num_of_attributes
-
-        for attr in range(num_of_attributes):
-            assert tuple(generated_data[attr].shape) <= (
-                BATCH_SIZE,
-                *input_shape[1:],
-            )
-            assert tuple(generated_data[num_of_attributes + attr].shape) <= (
-                BATCH_SIZE,
-                NUM_OF_CLASSES,
-            )
+        # Test the generator twice: once via the public API (TensorflowTileDBDataset)
+        # and once by calling _generator directly. Although the former calls the
+        # latter internally, that call is not reported as covered
+        # due to https://github.com/tensorflow/tensorflow/issues/33759
+        generators = [
+            iter(
+                TensorflowTileDBDataset(
+                    x_attribute_names=attribute_names,
+                    y_attribute_names=attribute_names,
+                    buffer_size=buffer_size,
+                    **kwargs
+                )
+            ),
+            _generator(
+                x_attrs=attribute_names,
+                y_attrs=attribute_names,
+                buffer_size=buffer_size or BATCH_SIZE,
+                **kwargs
+            ),
+        ]
+        for generator in generators:
+            generated_data = next(generator)
+            assert len(generated_data) == 2 * num_of_attributes
+            for attr in range(num_of_attributes):
+                assert tuple(generated_data[attr].shape) <= (
+                    BATCH_SIZE,
+                    *input_shape[1:],
+                )
+                assert tuple(generated_data[num_of_attributes + attr].shape) <= (
+                    BATCH_SIZE,
+                    NUM_OF_CLASSES,
+                )
 
     def test_buffer_size_geq_batch_size_exception(
         self,
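
A note on the `<=` assertions above: comparing shape tuples uses Python's lexicographic tuple ordering, which lets the final batch of an epoch carry fewer than BATCH_SIZE rows while a full batch compares equal. An illustrative check, with made-up shapes:

BATCH_SIZE = 32
full_batch_shape = (32, 10)
last_batch_shape = (7, 10)  # short final batch at the end of the epoch
assert full_batch_shape <= (BATCH_SIZE, 10)
assert last_batch_shape <= (BATCH_SIZE, 10)

Because the ordering is lexicographic, only the leading (row) dimension is strictly constrained once it is smaller than BATCH_SIZE; the trailing dimensions are checked exactly only for full batches.
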
117 changes: 78 additions & 39 deletions tests/test_tensorflow_sparse_data_api.py
@@ -8,7 +8,7 @@
 import tensorflow as tf
 
 import tiledb
-from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset
+from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset, _generator
 
 from .utils import create_sparse_array_one_hot_2d, ingest_in_tiledb
 
@@ -380,28 +380,48 @@ def test_generator_sparse_x_dense_y_batch_output(
         attribute_names = [
             "features_" + str(attr) for attr in range(num_of_attributes)
         ]
-        dataset = TensorflowTileDBDataset(
+        kwargs = dict(
             x_array=x,
             y_array=y,
             batch_size=BATCH_SIZE,
-            buffer_size=buffer_size,
             batch_shuffle=batch_shuffle,
-            x_attribute_names=attribute_names,
-            y_attribute_names=attribute_names,
         )
-        generated_data = next(iter(dataset))
-        assert len(generated_data) == 2 * num_of_attributes
-
-        for attr in range(num_of_attributes):
-            assert isinstance(generated_data[attr], tf.SparseTensor)
-            assert isinstance(generated_data[attr + num_of_attributes], tf.Tensor)
-
-            # Coords cannot exceed the batch size for either x or y
-            assert generated_data[attr].indices.shape[0] <= BATCH_SIZE
-            assert tuple(generated_data[attr + num_of_attributes].shape) <= (
-                BATCH_SIZE,
-                NUM_OF_CLASSES,
-            )
+        # Test the generator twice: once via the public API (TensorflowTileDBDataset)
+        # and once by calling _generator directly. Although the former calls the
+        # latter internally, that call is not reported as covered
+        # due to https://github.com/tensorflow/tensorflow/issues/33759
+        generators = [
+            iter(
+                TensorflowTileDBDataset(
+                    x_attribute_names=attribute_names,
+                    y_attribute_names=attribute_names,
+                    buffer_size=buffer_size,
+                    **kwargs
+                )
+            ),
+            _generator(
+                x_attrs=attribute_names,
+                y_attrs=attribute_names,
+                buffer_size=buffer_size or BATCH_SIZE,
+                **kwargs
+            ),
+        ]
+        for generator in generators:
+            generated_data = next(generator)
+            assert len(generated_data) == 2 * num_of_attributes
+
+            for attr in range(num_of_attributes):
+                assert isinstance(generated_data[attr], tf.SparseTensor)
+                assert isinstance(
+                    generated_data[attr + num_of_attributes], tf.Tensor
+                )
+
+                # Coords cannot exceed the batch size for either x or y
+                assert generated_data[attr].indices.shape[0] <= BATCH_SIZE
+                assert tuple(generated_data[attr + num_of_attributes].shape) <= (
+                    BATCH_SIZE,
+                    NUM_OF_CLASSES,
+                )
 
     def test_generator_sparse_x_sparse_y_batch_output(
         self, tmpdir, input_shape, num_of_attributes, batch_shuffle, buffer_size
@@ -429,29 +449,48 @@ def test_generator_sparse_x_sparse_y_batch_output(
         attribute_names = [
             "features_" + str(attr) for attr in range(num_of_attributes)
         ]
 
-        dataset = TensorflowTileDBDataset(
+        kwargs = dict(
             x_array=x,
             y_array=y,
             batch_size=BATCH_SIZE,
-            buffer_size=buffer_size,
             batch_shuffle=batch_shuffle,
-            x_attribute_names=attribute_names,
-            y_attribute_names=attribute_names,
         )
-        generated_data = next(iter(dataset))
-        assert len(generated_data) == 2 * num_of_attributes
-
-        for attr in range(num_of_attributes):
-            assert isinstance(generated_data[attr], tf.SparseTensor)
-            assert isinstance(
-                generated_data[attr + num_of_attributes], tf.SparseTensor
-            )
-
-            # Coords cannot exceed the batch size for either x or y
-            assert generated_data[attr].indices.shape[0] <= BATCH_SIZE
-
-            assert tuple(generated_data[attr + num_of_attributes].shape.dims) <= (
-                BATCH_SIZE,
-                NUM_OF_CLASSES,
-            )
+        # Test the generator twice: once via the public API (TensorflowTileDBDataset)
+        # and once by calling _generator directly. Although the former calls the
+        # latter internally, that call is not reported as covered
+        # due to https://github.com/tensorflow/tensorflow/issues/33759
+        generators = [
+            iter(
+                TensorflowTileDBDataset(
+                    x_attribute_names=attribute_names,
+                    y_attribute_names=attribute_names,
+                    buffer_size=buffer_size,
+                    **kwargs
+                )
+            ),
+            _generator(
+                x_attrs=attribute_names,
+                y_attrs=attribute_names,
+                buffer_size=buffer_size or BATCH_SIZE,
+                **kwargs
+            ),
+        ]
+        for generator in generators:
+            generated_data = next(generator)
+            assert len(generated_data) == 2 * num_of_attributes
+
+            for attr in range(num_of_attributes):
+                assert isinstance(generated_data[attr], tf.SparseTensor)
+                assert isinstance(
+                    generated_data[attr + num_of_attributes], tf.SparseTensor
+                )
+
+                # Coords cannot exceed the batch size for either x or y
+                assert generated_data[attr].indices.shape[0] <= BATCH_SIZE
+
+                assert tuple(
+                    generated_data[attr + num_of_attributes].shape.dims
+                ) <= (
+                    BATCH_SIZE,
+                    NUM_OF_CLASSES,
+                )
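
The coordinate assertions lean on the one-hot layout produced by create_sparse_array_one_hot_2d: each row holds exactly one nonzero entry, so a batch of at most BATCH_SIZE rows contributes at most BATCH_SIZE coordinate pairs. A standalone illustration (the label values below are made up for the example):

import tensorflow as tf

BATCH_SIZE = 4
NUM_OF_CLASSES = 5

labels = [1, 3, 0, 2]  # one class index per row
one_hot = tf.SparseTensor(
    indices=[[row, col] for row, col in enumerate(labels)],
    values=tf.ones(len(labels)),
    dense_shape=(BATCH_SIZE, NUM_OF_CLASSES),
)
# One nonzero per row => number of coords == number of rows <= BATCH_SIZE.
assert one_hot.indices.shape[0] <= BATCH_SIZE
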
15 changes: 5 additions & 10 deletions tiledb/ml/readers/tensorflow.py
@@ -42,8 +42,8 @@ def TensorflowTileDBDataset(
         raise TypeError("Dense x_array and sparse y_array not currently supported")
 
     # Check that x_array and y_array have the same number of rows
-    rows: int = x_array.schema.domain.shape[0]
-    if rows != y_array.schema.domain.shape[0]:
+    rows: int = x_array.shape[0]
+    if rows != y_array.shape[0]:
         raise ValueError(
             "x_array and y_array should have the same number of rows, i.e. the "
             "first dimension of x_array and y_array should be of equal domain extent"
@@ -70,7 +70,6 @@
             y_array=y_array,
             x_attrs=x_attribute_names,
             y_attrs=y_attribute_names,
-            offsets=range(0, rows, batch_size),
             batch_size=batch_size,
             buffer_size=buffer_size,
             batch_shuffle=batch_shuffle,
@@ -81,7 +80,7 @@
 
 
 def _get_attr_names(array: tiledb.Array) -> Sequence[str]:
-    return tuple(array.schema.attr(idx).name for idx in range(array.schema.nattr))
+    return tuple(array.attr(idx).name for idx in range(array.schema.nattr))
 
 
def _get_signature(
Expand All @@ -91,10 +90,7 @@ def _get_signature(
tf.SparseTensorSpec if isinstance(array, tiledb.SparseArray) else tf.TensorSpec
)
return tuple(
cls(
shape=(None, *array.schema.domain.shape[1:]),
dtype=array.schema.attr(attr).dtype,
)
cls(shape=(None, *array.shape[1:]), dtype=array.attr(attr).dtype)
for attr in attrs
)

@@ -104,7 +100,6 @@ def _generator(
     y_array: tiledb.Array,
     x_attrs: Sequence[str],
     y_attrs: Sequence[str],
-    offsets: range,
     batch_size: int,
     buffer_size: int,
     batch_shuffle: bool = False,
@@ -113,7 +108,7 @@
     x_batch = TensorflowBatch(x_array.schema, x_attrs, batch_size)
     y_batch = TensorflowBatch(y_array.schema, y_attrs, batch_size)
     with ThreadPoolExecutor(max_workers=2) as executor:
-        for offset in offsets:
+        for offset in range(0, x_array.shape[0], buffer_size):
             x_buffer, y_buffer = executor.map(
                 lambda array: array[offset : offset + buffer_size],  # type: ignore
                 (x_array, y_array),
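
For reference, the control flow of _generator after this change, sketched with plain numpy arrays standing in for TileDB arrays; the TensorflowBatch packing and the shuffle options are elided, so this is an approximation under those assumptions, not the module's actual code:

from concurrent.futures import ThreadPoolExecutor

import numpy as np


def buffered_batches(x_array, y_array, batch_size, buffer_size):
    # Hypothetical stand-in for _generator, reduced to the buffered read loop.
    with ThreadPoolExecutor(max_workers=2) as executor:
        # Offsets are now derived from the array itself, stepping by
        # buffer_size, instead of arriving as a precomputed `offsets` range.
        for offset in range(0, x_array.shape[0], buffer_size):
            # Fetch the x and y buffers concurrently, one worker per array.
            x_buffer, y_buffer = executor.map(
                lambda array: array[offset : offset + buffer_size],
                (x_array, y_array),
            )
            # Cut batch_size rows at a time out of each in-memory buffer.
            for start in range(0, len(x_buffer), batch_size):
                yield (
                    x_buffer[start : start + batch_size],
                    y_buffer[start : start + batch_size],
                )

Note the behavioral change this implies: the removed offsets range stepped by batch_size while each read fetched buffer_size rows, so consecutive reads could overlap; stepping by buffer_size appears to fetch each row once per pass and slice batches from the buffer in memory.
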

0 comments on commit db10369
