From a7893459026abc60bcba0c4a93dff933691a961d Mon Sep 17 00:00:00 2001 From: Dale Tovar Date: Thu, 11 Mar 2021 14:48:03 -0700 Subject: [PATCH 1/3] add dtypes --- sparse/_compressed/common.py | 8 ++++- sparse/_compressed/compressed.py | 55 ++++++++++++++++++++++++-------- sparse/_compressed/convert.py | 26 ++++++++------- sparse/_compressed/indexing.py | 18 +++++++---- sparse/_coo/common.py | 27 ++++++++++++++-- sparse/_coo/core.py | 55 ++++++++++++++++++++++---------- sparse/_utils.py | 25 ++++++++++++++- sparse/tests/test_coo.py | 7 ++++ 8 files changed, 168 insertions(+), 53 deletions(-) diff --git a/sparse/_compressed/common.py b/sparse/_compressed/common.py index 24589655..c3610387 100644 --- a/sparse/_compressed/common.py +++ b/sparse/_compressed/common.py @@ -1,5 +1,5 @@ import numpy as np -from .._utils import check_consistent_fill_value, normalize_axis +from .._utils import check_consistent_fill_value, normalize_axis, can_store def concatenate(arrays, axis=0, compressed_axes=None): @@ -41,6 +41,9 @@ def concatenate(arrays, axis=0, compressed_axes=None): data = np.concatenate([arr.data for arr in arrays]) ptr_len = arrays[0].indptr.shape[0] nnz = arrays[0].nnz + total_nnz = sum([arr.nnz for arr in arrays[:-1]]) + if not can_store(indptr.dtype, total_nnz): + indptr = indptr.astype(np.min_scalar_type(total_nnz)) for i in range(1, len(arrays)): indptr[ptr_len:] += nnz nnz = arrays[i].nnz @@ -93,6 +96,9 @@ def stack(arrays, axis=0, compressed_axes=None): data = np.concatenate([arr.data for arr in arrays]) ptr_len = arrays[0].indptr.shape[0] nnz = arrays[0].nnz + total_nnz = sum([arr.nnz for arr in arrays[:-1]]) + if not can_store(indptr.dtype, total_nnz): + indptr = indptr.astype(np.min_scalar_type(total_nnz)) for i in range(1, len(arrays)): indptr[ptr_len:] += nnz nnz = arrays[i].nnz diff --git a/sparse/_compressed/compressed.py b/sparse/_compressed/compressed.py index d50e9eda..bf92fe20 100644 --- a/sparse/_compressed/compressed.py +++ b/sparse/_compressed/compressed.py @@ -11,6 +11,7 @@ from .._common import dot, matmul from .._utils import ( normalize_axis, + can_store, check_zero_fill_value, check_compressed_axes, equivalent, @@ -20,7 +21,7 @@ from .indexing import getitem -def _from_coo(x, compressed_axes=None): +def _from_coo(x, compressed_axes=None, storage_dtype=None): if x.ndim == 0: if compressed_axes is not None: @@ -49,16 +50,28 @@ def _from_coo(x, compressed_axes=None): compressed_shape = (row_size, col_size) shape = x.shape + if storage_dtype and not can_store(storage_dtype, max(compressed_shape)): + raise ValueError( + "cannot store array with the compressed shape of {} with dtype {}.".format( + compressed_shape, storage_dtype + ) + ) + + if not storage_dtype: + storage_dtype = x.coords.dtype + if not can_store(storage_dtype, max(compressed_shape)): + storage_dtype = np.min_scalar_type(max(compressed_shape)) + # transpose axes, linearize, reshape, and compress linear = linear_loc(x.coords[axis_order], reordered_shape) order = np.argsort(linear) linear = linear[order] - coords = np.empty((2, x.nnz), dtype=np.intp) + coords = np.empty((2, x.nnz), dtype=storage_dtype) strides = 1 for i, d in enumerate(compressed_shape[::-1]): coords[-(i + 1), :] = (linear // strides) % d strides *= d - indptr = np.empty(row_size + 1, dtype=np.intp) + indptr = np.empty(row_size + 1, dtype=storage_dtype) indptr[0] = 0 np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:]) indices = coords[1] @@ -112,7 +125,13 @@ class GCXS(SparseArray, NDArrayOperatorsMixin): __array_priority__ = 12 def __init__( - self, arg, shape=None, compressed_axes=None, prune=False, fill_value=0 + self, + arg, + shape=None, + compressed_axes=None, + prune=False, + fill_value=0, + storage_dtype=None, ): if isinstance(arg, np.ndarray): @@ -121,7 +140,9 @@ def __init__( ) elif isinstance(arg, COO): - (arg, shape, compressed_axes, fill_value) = _from_coo(arg, compressed_axes) + (arg, shape, compressed_axes, fill_value) = _from_coo( + arg, compressed_axes, storage_dtype + ) if shape is None: raise ValueError("missing `shape` argument") @@ -157,13 +178,15 @@ def copy(self, deep=True): return _copy.deepcopy(self) if deep else _copy.copy(self) @classmethod - def from_numpy(cls, x, compressed_axes=None, fill_value=0): - coo = COO(x, fill_value=fill_value) - return cls.from_coo(coo, compressed_axes) + def from_numpy(cls, x, compressed_axes=None, fill_value=0, storage_dtype=None): + coo = COO(x, fill_value=fill_value, storage_dtype=storage_dtype) + return cls.from_coo(coo, compressed_axes, storage_dtype) @classmethod - def from_coo(cls, x, compressed_axes=None): - (arg, shape, compressed_axes, fill_value) = _from_coo(x, compressed_axes) + def from_coo(cls, x, compressed_axes=None, storage_dtype=None): + (arg, shape, compressed_axes, fill_value) = _from_coo( + x, compressed_axes, storage_dtype + ) return cls( arg, shape=shape, compressed_axes=compressed_axes, fill_value=fill_value ) @@ -181,9 +204,13 @@ def from_scipy_sparse(cls, x): ) @classmethod - def from_iter(cls, x, shape=None, compressed_axes=None, fill_value=None): + def from_iter( + cls, x, shape=None, compressed_axes=None, fill_value=None, storage_dtype=None + ): return cls.from_coo( - COO.from_iter(x, shape, fill_value), compressed_axes=compressed_axes + COO.from_iter(x, shape, fill_value), + compressed_axes, + storage_dtype, ) @property @@ -289,7 +316,7 @@ def _reduce_calc(self, method, axis, keepdims=False, **kwargs): x = self.change_compressed_axes(compressed_axes) idx = np.diff(x.indptr) != 0 indptr = x.indptr[:-1][idx] - indices = (np.arange(x._compressed_shape[0], dtype=np.intp))[idx] + indices = (np.arange(x._compressed_shape[0], dtype=self.indptr.dtype))[idx] data = method.reduceat(x.data, indptr, **kwargs) counts = x.indptr[1:][idx] - x.indptr[:-1][idx] arr_attrs = (x, compressed_axes, indices) @@ -782,7 +809,7 @@ def _prune(self): coords = coords[:, mask] self.indices = coords[1] row_size = self._compressed_shape[0] - indptr = np.empty(row_size + 1, dtype=np.intp) + indptr = np.empty(row_size + 1, dtype=self.indptr.dtype) indptr[0] = 0 np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:]) self.indptr = indptr diff --git a/sparse/_compressed/convert.py b/sparse/_compressed/convert.py index a9e399fe..cbf99748 100644 --- a/sparse/_compressed/convert.py +++ b/sparse/_compressed/convert.py @@ -1,13 +1,13 @@ import numpy as np import numba import operator -from .._utils import check_compressed_axes +from .._utils import check_compressed_axes, get_out_dtype from .._coo.common import linear_loc from functools import reduce from numba.typed import List -def convert_to_flat(inds, shape): +def convert_to_flat(inds, shape, dtype): """ Converts the indices of either the compressed or uncompressed axes into a linearized form. Prepares the inputs for compute_flat. @@ -15,11 +15,11 @@ def convert_to_flat(inds, shape): inds = [np.array(ind) for ind in inds] if any(ind.ndim > 1 for ind in inds): raise IndexError("Only one-dimensional iterable indices supported.") - cols = np.empty(np.prod([ind.size for ind in inds]), dtype=np.intp) + cols = np.empty(np.prod([ind.size for ind in inds]), dtype=dtype) shape_bins = transform_shape(np.asarray(shape)) increments = List() for i in range(len(inds)): - increments.append((inds[i] * shape_bins[i]).astype(np.int32)) + increments.append((inds[i] * shape_bins[i]).astype(dtype)) operations = np.prod([ind.shape[0] for ind in increments[:-1]]) return compute_flat(increments, cols, operations) @@ -67,7 +67,7 @@ def transform_shape(shape): # pragma: no cover @numba.jit(nopython=True, nogil=True) def uncompress_dimension(indptr): # pragma: no cover """converts an index pointer array into an array of coordinates""" - uncompressed = np.empty(indptr[-1], dtype=np.intp) + uncompressed = np.empty(indptr[-1], dtype=indptr.dtype) for i in range(len(indptr) - 1): uncompressed[indptr[i] : indptr[i + 1]] = i return uncompressed @@ -123,7 +123,8 @@ def _1d_reshape(x, shape, compressed_axes): x_indices = x.indices[:end_idx] new_nnz = x_indices.size new_linear = np.empty(new_nnz, dtype=np.intp) - new_coords = np.empty((2, new_nnz), dtype=np.intp) + coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape)) + new_coords = np.empty((2, new_nnz), dtype=coords_dtype) _linearize( x_indices, @@ -137,7 +138,7 @@ def _1d_reshape(x, shape, compressed_axes): order = np.argsort(new_linear) new_coords = new_coords[:, order] - indptr = np.empty(row_size + 1, dtype=np.intp) + indptr = np.empty(row_size + 1, dtype=coords_dtype) indptr[0] = 0 np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:]) indices = new_coords[1] @@ -162,7 +163,8 @@ def _resize(x, shape, compressed_axes): coords = np.stack((uncompressed, x.indices)) linear = linear_loc(coords, x._compressed_shape) sorted_axis_order = np.argsort(x._axis_order) - c_linear = np.empty(x.nnz, dtype=np.intp) + linear_dtype = get_out_dtype(x.indices, np.prod(shape)) + c_linear = np.empty(x.nnz, dtype=linear_dtype) _c_ordering( linear, @@ -203,7 +205,8 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False): linear = linear_loc(coords, x._compressed_shape) sorted_axis_order = np.argsort(x._axis_order) if len(shape) == 1: - c_linear = np.empty(x.nnz, dtype=np.intp) + dtype = get_out_dtype(x.indices, shape[0]) + c_linear = np.empty(x.nnz, dtype=dtype) _c_ordering( linear, c_linear, @@ -220,11 +223,12 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False): new_axis_order.extend(np.setdiff1d(np.arange(len(shape)), compressed_axes)) new_linear = np.empty(x.nnz, dtype=np.intp) new_reordered_shape = np.array(shape)[new_axis_order] - new_coords = np.empty((2, x.nnz), dtype=np.intp) axisptr = len(compressed_axes) row_size = np.prod(new_reordered_shape[:axisptr]) col_size = np.prod(new_reordered_shape[axisptr:]) new_compressed_shape = np.array((row_size, col_size)) + coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape)) + new_coords = np.empty((2, x.nnz), dtype=coords_dtype) _convert_coords( linear, @@ -247,7 +251,7 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False): indptr = [] indices = coords[0, :] else: - indptr = np.empty(row_size + 1, dtype=np.intp) + indptr = np.empty(row_size + 1, dtype=coords_dtype) indptr[0] = 0 np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:]) indices = new_coords[1] diff --git a/sparse/_compressed/indexing.py b/sparse/_compressed/indexing.py index 378d71c4..3af09509 100644 --- a/sparse/_compressed/indexing.py +++ b/sparse/_compressed/indexing.py @@ -89,13 +89,17 @@ def getitem(x, key): # convert all indices of compressed axes to a single array index # this tells us which 'rows' of the underlying csr matrix to iterate through rows = convert_to_flat( - reordered_key[: x._axisptr], x._reordered_shape[: x._axisptr] + reordered_key[: x._axisptr], + x._reordered_shape[: x._axisptr], + x.indices.dtype, ) # convert all indices of uncompressed axes to a single array index # this tells us which 'columns' of the underlying csr matrix to iterate through cols = convert_to_flat( - reordered_key[x._axisptr :], x._reordered_shape[x._axisptr :] + reordered_key[x._axisptr :], + x._reordered_shape[x._axisptr :], + x.indices.dtype, ) starts = x.indptr[:-1][rows] # find the start and end of each of the rows @@ -117,7 +121,7 @@ def getitem(x, key): compressed_axes = (0,) # defaults to 0 row_size = starts.size - indptr = np.empty(row_size + 1, dtype=np.intp) + indptr = np.empty(row_size + 1, dtype=x.indptr.dtype) indptr[0] = 0 if pos_slice: arg = get_slicing_selection(x.data, x.indices, indptr, starts, ends, cols) @@ -134,7 +138,7 @@ def getitem(x, key): indptr = None else: indices = uncompressed % size - indptr = np.empty(shape[0] + 1, dtype=np.intp) + indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype) indptr[0] = 0 np.cumsum( np.bincount(uncompressed // size, minlength=shape[0]), out=indptr[1:] @@ -144,7 +148,7 @@ def getitem(x, key): indptr = None else: uncompressed = indices // size - indptr = np.empty(shape[0] + 1, dtype=np.intp) + indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype) indptr[0] = 0 np.cumsum(np.bincount(uncompressed, minlength=shape[0]), out=indptr[1:]) indices = indices % size @@ -230,7 +234,7 @@ def get_slicing_selection( ind_list.extend(inds) indptr[i + 1] = indptr[i] + len(inds) ind_list = np.array(ind_list, dtype=np.int64) - indices = np.array(indices) + indices = np.array(indices, dtype=indptr.dtype) data = arr_data[ind_list] return (data, indices, indptr) @@ -261,7 +265,7 @@ def get_array_selection( ind_list.extend(inds) indptr[i + 1] = indptr[i] + len(inds) ind_list = np.array(ind_list, dtype=np.int64) - indices = np.array(indices) + indices = np.array(indices, dtype=indptr.dtype) data = arr_data[ind_list] return (data, indices, indptr) diff --git a/sparse/_coo/common.py b/sparse/_coo/common.py index b9162051..97da19d6 100644 --- a/sparse/_coo/common.py +++ b/sparse/_coo/common.py @@ -10,9 +10,11 @@ from .._sparse_array import SparseArray from .._utils import ( isscalar, + is_unsigned_dtype, normalize_axis, check_zero_fill_value, check_consistent_fill_value, + can_store, ) @@ -173,6 +175,8 @@ def concatenate(arrays, axis=0): data = np.concatenate([x.data for x in arrays]) coords = np.concatenate([x.coords for x in arrays], axis=1) + if not can_store(coords.dtype, max(shape)): + coords = coords.astype(np.min_scalar_type(max(shape))) dim = 0 for x in arrays: if dim: @@ -688,6 +692,7 @@ def roll(a, shift, axis=None): Output array, with the same shape as a. """ from .core import COO, as_coo + from numpy.core._exceptions import UFuncTypeError a = as_coo(a) @@ -719,11 +724,27 @@ def roll(a, shift, axis=None): "If 'shift' is a 1D sequence, " "'axis' must have equal length." ) + if not can_store(a.coords.dtype, max(a.shape + shift)): + raise ValueError( + "cannot roll with coords.dtype {} and shift {}. Try casting coords to a larger dtype.".format( + a.coords.dtype, + shift, + ) + ) + # shift elements coords, data = np.copy(a.coords), np.copy(a.data) - for sh, ax in zip(shift, axis): - coords[ax] += sh - coords[ax] %= a.shape[ax] + try: + for sh, ax in zip(shift, axis): + coords[ax] += sh + coords[ax] %= a.shape[ax] + except UFuncTypeError: + if is_unsigned_dtype(coords.dtype): + raise ValueError( + "rolling with coords.dtype as {} is not safe. Try using a signed dtype.".format( + coords.dtype + ) + ) return COO( coords, diff --git a/sparse/_coo/core.py b/sparse/_coo/core.py index 05267bc6..975d0cee 100644 --- a/sparse/_coo/core.py +++ b/sparse/_coo/core.py @@ -14,7 +14,13 @@ from .indexing import getitem from .._umath import elemwise, broadcast_to from .._sparse_array import SparseArray, _reduce_super_ufunc -from .._utils import normalize_axis, equivalent, check_zero_fill_value, _zero_of_dtype +from .._utils import ( + normalize_axis, + equivalent, + check_zero_fill_value, + _zero_of_dtype, + can_store, +) class COO(SparseArray, NDArrayOperatorsMixin): # lgtm [py/missing-equals] @@ -205,13 +211,16 @@ def __init__( prune=False, cache=False, fill_value=None, + storage_dtype=None, ): self._cache = None if cache: self.enable_caching() if data is None: - arr = as_coo(coords, shape=shape, fill_value=fill_value) + arr = as_coo( + coords, shape=shape, fill_value=fill_value, storage_dtype=storage_dtype + ) self._make_shallow_copy_of(arr) if cache: self.enable_caching() @@ -234,7 +243,7 @@ def __init__( if shape and not self.coords.size: self.coords = np.zeros( - (len(shape) if isinstance(shape, Iterable) else 1, 0), dtype=np.uint64 + (len(shape) if isinstance(shape, Iterable) else 1, 0), dtype=np.intp ) if shape is None: @@ -243,8 +252,18 @@ def __init__( else: shape = () + if not isinstance(shape, Iterable): + shape = (shape,) + super().__init__(shape, fill_value=fill_value) - self.coords = self.coords.astype(np.intp, copy=False) + if storage_dtype: + if not can_store(storage_dtype, max(shape)): + raise ValueError( + "cannot cast array with shape {} to dtype {}.".format( + shape, storage_dtype + ) + ) + self.coords = self.coords.astype(storage_dtype) if self.shape: if len(self.data) != self.coords.shape[1]: @@ -332,7 +351,7 @@ def enable_caching(self): self._cache = defaultdict(lambda: deque(maxlen=3)) @classmethod - def from_numpy(cls, x, fill_value=None): + def from_numpy(cls, x, fill_value=None, storage_dtype=None): """ Convert the given :obj:`numpy.ndarray` to a :obj:`COO` object. @@ -379,6 +398,7 @@ def from_numpy(cls, x, fill_value=None): has_duplicates=False, sorted=True, fill_value=fill_value, + storage_dtype=storage_dtype, ) def todense(self): @@ -618,7 +638,7 @@ def nbytes(self): >>> coords = np.random.randint(1000, size=(3, 6), dtype=np.uint16) >>> s = COO(coords, data, shape=(1000, 1000, 1000)) >>> s.nbytes - 150 + 42 """ return self.data.nbytes + self.coords.nbytes @@ -997,9 +1017,6 @@ def reshape(self, shape, order="C"): extra = int(self.size / np.prod([d for d in shape if d != -1])) shape = tuple([d if d != -1 else extra for d in shape]) - if self.shape == shape: - return self - if self.size != reduce(operator.mul, shape, 1): raise ValueError( "cannot reshape array of size {} into shape {}".format(self.size, shape) @@ -1013,7 +1030,10 @@ def reshape(self, shape, order="C"): # TODO: this self.size enforces a 2**64 limit to array size linear_loc = self.linear_loc() - coords = np.empty((len(shape), self.nnz), dtype=np.intp) + storage_dtype = self.coords.dtype + if shape != () and not can_store(storage_dtype, max(shape)): + storage_dtype = np.min_scalar_type(max(shape)) + coords = np.empty((len(shape), self.nnz), dtype=storage_dtype) strides = 1 for i, d in enumerate(shape[::-1]): coords[-(i + 1), :] = (linear_loc // strides) % d @@ -1033,7 +1053,7 @@ def reshape(self, shape, order="C"): self._cache["reshape"].append((shape, result)) return result - def resize(self, *args, refcheck=True): + def resize(self, *args, refcheck=True, coords_dtype=np.intp): """ This method changes the shape and size of an array in-place. Parameters @@ -1063,7 +1083,10 @@ def resize(self, *args, refcheck=True): end_idx = np.searchsorted(linear_loc, new_size, side="left") linear_loc = linear_loc[:end_idx] - coords = np.empty((len(shape), len(linear_loc)), dtype=np.intp) + storage_dtype = self.coords.dtype + if shape != () and not can_store(storage_dtype, max(shape)): + storage_dtype = np.min_scalar_type(max(shape)) + coords = np.empty((len(shape), len(linear_loc)), dtype=storage_dtype) strides = 1 for i, d in enumerate(shape[::-1]): coords[-(i + 1), :] = (linear_loc // strides) % d @@ -1218,7 +1241,7 @@ def _sort_indices(self): >>> s = COO(coords, data) >>> s._sort_indices() >>> s.coords # doctest: +NORMALIZE_WHITESPACE - array([[0, 1, 2]]) + array([[0, 1, 2]], dtype=uint8) >>> s.data # doctest: +NORMALIZE_WHITESPACE array([3, 4, 1], dtype=uint8) """ @@ -1246,7 +1269,7 @@ def _sum_duplicates(self): >>> s = COO(coords, data) >>> s._sum_duplicates() >>> s.coords # doctest: +NORMALIZE_WHITESPACE - array([[0, 1, 2]]) + array([[0, 1, 2]], dtype=uint8) >>> s.data # doctest: +NORMALIZE_WHITESPACE array([6, 7, 2], dtype=uint8) """ @@ -1428,7 +1451,7 @@ def asformat(self, format, compressed_axes=None): raise NotImplementedError("The given format is not supported.") -def as_coo(x, shape=None, fill_value=None): +def as_coo(x, shape=None, fill_value=None, storage_dtype=None): """ Converts any given format to :obj:`COO`. See the "See Also" section for details. @@ -1467,7 +1490,7 @@ def as_coo(x, shape=None, fill_value=None): return x.asformat("coo") if isinstance(x, np.ndarray): - return COO.from_numpy(x, fill_value=fill_value) + return COO.from_numpy(x, fill_value=fill_value, storage_dtype=storage_dtype) if isinstance(x, scipy.sparse.spmatrix): return COO.from_scipy_sparse(x) diff --git a/sparse/_utils.py b/sparse/_utils.py index e4e45b5e..a755ecf3 100644 --- a/sparse/_utils.py +++ b/sparse/_utils.py @@ -85,6 +85,7 @@ def random( format="coo", compressed_axes=None, fill_value=None, + storage_dtype=None, ): """Generate a random sparse multidimensional array @@ -190,7 +191,13 @@ def random( data = data_rvs(nnz) - ar = COO(ind[None, :], data, shape=elements, fill_value=fill_value).reshape(shape) + ar = COO( + ind[None, :], + data, + shape=elements, + fill_value=fill_value, + storage_dtype=storage_dtype, + ).reshape(shape) return ar.asformat(format, compressed_axes=compressed_axes) @@ -461,3 +468,19 @@ def check_consistent_fill_value(arrays): "is different from a fill_value of {!s} in the first " "argument.".format(i, arg.fill_value, fv) ) + + +def get_out_dtype(arr, scalar): + out_type = arr.dtype + if not can_store(out_type, scalar): + out_type = np.min_scalar_type(scalar) + return out_type + + +def can_store(dtype, scalar): + # return dtype(scalar) == scalar + return np.array(scalar, dtype=dtype) == np.array(scalar) + + +def is_unsigned_dtype(dtype): + return not np.array(-1, dtype=dtype) == np.array(-1) diff --git a/sparse/tests/test_coo.py b/sparse/tests/test_coo.py index b140748a..eea53295 100644 --- a/sparse/tests/test_coo.py +++ b/sparse/tests/test_coo.py @@ -1195,6 +1195,13 @@ def test_valerr(self, args): with pytest.raises(ValueError): sparse.roll(x, *args) + @pytest.mark.parametrize("dtype", [np.uint8, np.int8]) + @pytest.mark.parametrize("shift", [300, -300]) + def test_dtype_errors(self, dtype, shift): + x = sparse.random((5, 5, 5), density=0.2, storage_dtype=dtype) + with pytest.raises(ValueError): + sparse.roll(x, shift) + def test_clip(): x = np.array([[0, 0, 1, 0, 2], [5, 0, 0, 3, 0]]) From 293a7fefe769e518baa31f64f840f2e4f49541ed Mon Sep 17 00:00:00 2001 From: Dale Tovar Date: Fri, 12 Mar 2021 19:43:56 -0700 Subject: [PATCH 2/3] add coverage --- sparse/_compressed/common.py | 4 ++-- sparse/_utils.py | 12 ++++++++++-- sparse/tests/test_compressed.py | 22 ++++++++++++++++++++++ sparse/tests/test_coo.py | 22 ++++++++++++++++++++++ 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/sparse/_compressed/common.py b/sparse/_compressed/common.py index c3610387..d24a8b66 100644 --- a/sparse/_compressed/common.py +++ b/sparse/_compressed/common.py @@ -41,7 +41,7 @@ def concatenate(arrays, axis=0, compressed_axes=None): data = np.concatenate([arr.data for arr in arrays]) ptr_len = arrays[0].indptr.shape[0] nnz = arrays[0].nnz - total_nnz = sum([arr.nnz for arr in arrays[:-1]]) + total_nnz = sum(int(arr.nnz) for arr in arrays) if not can_store(indptr.dtype, total_nnz): indptr = indptr.astype(np.min_scalar_type(total_nnz)) for i in range(1, len(arrays)): @@ -96,7 +96,7 @@ def stack(arrays, axis=0, compressed_axes=None): data = np.concatenate([arr.data for arr in arrays]) ptr_len = arrays[0].indptr.shape[0] nnz = arrays[0].nnz - total_nnz = sum([arr.nnz for arr in arrays[:-1]]) + total_nnz = sum(int(arr.nnz) for arr in arrays) if not can_store(indptr.dtype, total_nnz): indptr = indptr.astype(np.min_scalar_type(total_nnz)) for i in range(1, len(arrays)): diff --git a/sparse/_utils.py b/sparse/_utils.py index a755ecf3..24c1f89d 100644 --- a/sparse/_utils.py +++ b/sparse/_utils.py @@ -196,9 +196,18 @@ def random( data, shape=elements, fill_value=fill_value, - storage_dtype=storage_dtype, ).reshape(shape) + if storage_dtype: + if can_store(storage_dtype, max(shape)): + ar.coords = ar.coords.astype(storage_dtype) + else: + raise ValueError( + "cannot cast array with shape {} to dtype {}.".format( + shape, storage_dtype + ) + ) + return ar.asformat(format, compressed_axes=compressed_axes) @@ -478,7 +487,6 @@ def get_out_dtype(arr, scalar): def can_store(dtype, scalar): - # return dtype(scalar) == scalar return np.array(scalar, dtype=dtype) == np.array(scalar) diff --git a/sparse/tests/test_compressed.py b/sparse/tests/test_compressed.py index 0069d4b7..72a57053 100644 --- a/sparse/tests/test_compressed.py +++ b/sparse/tests/test_compressed.py @@ -430,3 +430,25 @@ def test_flatten(in_shape): e = x.flatten() assert_eq(e, a) + + +def test_gcxs_valerr(): + a = np.arange(300) + with pytest.raises(ValueError): + GCXS.from_numpy(a, storage_dtype=np.int8) + + +def test_upcast(): + a = sparse.random((50, 50, 50), density=0.1, format="coo", storage_dtype=np.uint8) + b = a.asformat("gcxs") + assert b.indices.dtype == np.uint16 + + a = sparse.random((8, 7, 6), density=0.5, format="gcxs", storage_dtype=np.uint8) + assert sparse.concatenate((a, a)).indptr.dtype == np.uint16 + assert sparse.stack((a, a)).indptr.dtype == np.uint16 + + +def test_from_coo(): + a = sparse.random((5, 5, 5), density=0.1, format="coo") + b = GCXS(a) + assert_eq(a, b) diff --git a/sparse/tests/test_coo.py b/sparse/tests/test_coo.py index eea53295..342250d2 100644 --- a/sparse/tests/test_coo.py +++ b/sparse/tests/test_coo.py @@ -242,6 +242,12 @@ def test_resize(a, b): assert_eq(x, s) +def test_resize_upcast(): + s = sparse.random((10, 10, 10), density=0.5, format="coo", storage_dtype=np.uint8) + s.resize(600) + assert s.coords.dtype == np.uint16 + + @pytest.mark.parametrize("axis1", [-3, -2, -1, 0, 1, 2]) @pytest.mark.parametrize("axis2", [-3, -2, -1, 0, 1, 2]) def test_swapaxes(axis1, axis2): @@ -371,6 +377,11 @@ def test_reshape_function(): assert_eq(s2, x.reshape(shape)) +def test_reshape_upcast(): + a = sparse.random((10, 10, 10), density=0.5, format="coo", storage_dtype=np.uint8) + assert a.reshape(1000).coords.dtype == np.uint16 + + def test_to_scipy_sparse(): s = sparse.random((3, 5), density=0.5) a = s.to_scipy_sparse() @@ -1202,6 +1213,11 @@ def test_dtype_errors(self, dtype, shift): with pytest.raises(ValueError): sparse.roll(x, shift) + def test_unsigned_type_error(self): + x = sparse.random((5, 5, 5), density=0.3, storage_dtype=np.uint8) + with pytest.raises(ValueError): + sparse.roll(x, -1) + def test_clip(): x = np.array([[0, 0, 1, 0, 2], [5, 0, 0, 3, 0]]) @@ -1579,3 +1595,9 @@ def test_astype_no_copy(): s1 = sparse.random((2, 3, 4), density=0.5) s2 = s1.astype(s1.dtype, copy=False) assert s1 is s2 + + +def test_coo_valerr(): + a = np.arange(300) + with pytest.raises(ValueError): + COO.from_numpy(a, storage_dtype=np.int8) From 5a7bef16d2d5005d02ed28eba437bf71e9675f16 Mon Sep 17 00:00:00 2001 From: Dale Tovar Date: Tue, 16 Mar 2021 16:53:23 -0600 Subject: [PATCH 3/3] add coverage --- sparse/_compressed/compressed.py | 40 +++++++++++++++++--------------- sparse/_compressed/convert.py | 4 ++-- sparse/_coo/core.py | 36 ++++++++++++++-------------- sparse/_utils.py | 12 ++++------ sparse/tests/test_compressed.py | 15 ++++++++---- sparse/tests/test_coo.py | 15 ++++++++---- 6 files changed, 67 insertions(+), 55 deletions(-) diff --git a/sparse/_compressed/compressed.py b/sparse/_compressed/compressed.py index bf92fe20..fd24b30e 100644 --- a/sparse/_compressed/compressed.py +++ b/sparse/_compressed/compressed.py @@ -21,7 +21,7 @@ from .indexing import getitem -def _from_coo(x, compressed_axes=None, storage_dtype=None): +def _from_coo(x, compressed_axes=None, idx_dtype=None): if x.ndim == 0: if compressed_axes is not None: @@ -50,28 +50,30 @@ def _from_coo(x, compressed_axes=None, storage_dtype=None): compressed_shape = (row_size, col_size) shape = x.shape - if storage_dtype and not can_store(storage_dtype, max(compressed_shape)): + if idx_dtype and not can_store(idx_dtype, max(max(compressed_shape), x.nnz)): raise ValueError( - "cannot store array with the compressed shape of {} with dtype {}.".format( - compressed_shape, storage_dtype + "cannot store array with the compressed shape {} and nnz {} with dtype {}.".format( + compressed_shape, + x.nnz, + idx_dtype, ) ) - if not storage_dtype: - storage_dtype = x.coords.dtype - if not can_store(storage_dtype, max(compressed_shape)): - storage_dtype = np.min_scalar_type(max(compressed_shape)) + if not idx_dtype: + idx_dtype = x.coords.dtype + if not can_store(idx_dtype, max(max(compressed_shape), x.nnz)): + idx_dtype = np.min_scalar_type(max(max(compressed_shape), x.nnz)) # transpose axes, linearize, reshape, and compress linear = linear_loc(x.coords[axis_order], reordered_shape) order = np.argsort(linear) linear = linear[order] - coords = np.empty((2, x.nnz), dtype=storage_dtype) + coords = np.empty((2, x.nnz), dtype=idx_dtype) strides = 1 for i, d in enumerate(compressed_shape[::-1]): coords[-(i + 1), :] = (linear // strides) % d strides *= d - indptr = np.empty(row_size + 1, dtype=storage_dtype) + indptr = np.empty(row_size + 1, dtype=idx_dtype) indptr[0] = 0 np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:]) indices = coords[1] @@ -131,7 +133,7 @@ def __init__( compressed_axes=None, prune=False, fill_value=0, - storage_dtype=None, + idx_dtype=None, ): if isinstance(arg, np.ndarray): @@ -141,7 +143,7 @@ def __init__( elif isinstance(arg, COO): (arg, shape, compressed_axes, fill_value) = _from_coo( - arg, compressed_axes, storage_dtype + arg, compressed_axes, idx_dtype ) if shape is None: @@ -178,14 +180,14 @@ def copy(self, deep=True): return _copy.deepcopy(self) if deep else _copy.copy(self) @classmethod - def from_numpy(cls, x, compressed_axes=None, fill_value=0, storage_dtype=None): - coo = COO(x, fill_value=fill_value, storage_dtype=storage_dtype) - return cls.from_coo(coo, compressed_axes, storage_dtype) + def from_numpy(cls, x, compressed_axes=None, fill_value=0, idx_dtype=None): + coo = COO(x, fill_value=fill_value, idx_dtype=idx_dtype) + return cls.from_coo(coo, compressed_axes, idx_dtype) @classmethod - def from_coo(cls, x, compressed_axes=None, storage_dtype=None): + def from_coo(cls, x, compressed_axes=None, idx_dtype=None): (arg, shape, compressed_axes, fill_value) = _from_coo( - x, compressed_axes, storage_dtype + x, compressed_axes, idx_dtype ) return cls( arg, shape=shape, compressed_axes=compressed_axes, fill_value=fill_value @@ -205,12 +207,12 @@ def from_scipy_sparse(cls, x): @classmethod def from_iter( - cls, x, shape=None, compressed_axes=None, fill_value=None, storage_dtype=None + cls, x, shape=None, compressed_axes=None, fill_value=None, idx_dtype=None ): return cls.from_coo( COO.from_iter(x, shape, fill_value), compressed_axes, - storage_dtype, + idx_dtype, ) @property diff --git a/sparse/_compressed/convert.py b/sparse/_compressed/convert.py index cbf99748..9fb772d0 100644 --- a/sparse/_compressed/convert.py +++ b/sparse/_compressed/convert.py @@ -123,7 +123,7 @@ def _1d_reshape(x, shape, compressed_axes): x_indices = x.indices[:end_idx] new_nnz = x_indices.size new_linear = np.empty(new_nnz, dtype=np.intp) - coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape)) + coords_dtype = get_out_dtype(x.indices, max(max(new_compressed_shape), x.nnz)) new_coords = np.empty((2, new_nnz), dtype=coords_dtype) _linearize( @@ -227,7 +227,7 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False): row_size = np.prod(new_reordered_shape[:axisptr]) col_size = np.prod(new_reordered_shape[axisptr:]) new_compressed_shape = np.array((row_size, col_size)) - coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape)) + coords_dtype = get_out_dtype(x.indices, max(max(new_compressed_shape), x.nnz)) new_coords = np.empty((2, x.nnz), dtype=coords_dtype) _convert_coords( diff --git a/sparse/_coo/core.py b/sparse/_coo/core.py index 975d0cee..b8d65955 100644 --- a/sparse/_coo/core.py +++ b/sparse/_coo/core.py @@ -211,7 +211,7 @@ def __init__( prune=False, cache=False, fill_value=None, - storage_dtype=None, + idx_dtype=None, ): self._cache = None if cache: @@ -219,7 +219,7 @@ def __init__( if data is None: arr = as_coo( - coords, shape=shape, fill_value=fill_value, storage_dtype=storage_dtype + coords, shape=shape, fill_value=fill_value, idx_dtype=idx_dtype ) self._make_shallow_copy_of(arr) if cache: @@ -256,14 +256,14 @@ def __init__( shape = (shape,) super().__init__(shape, fill_value=fill_value) - if storage_dtype: - if not can_store(storage_dtype, max(shape)): + if idx_dtype: + if not can_store(idx_dtype, max(shape)): raise ValueError( "cannot cast array with shape {} to dtype {}.".format( - shape, storage_dtype + shape, idx_dtype ) ) - self.coords = self.coords.astype(storage_dtype) + self.coords = self.coords.astype(idx_dtype) if self.shape: if len(self.data) != self.coords.shape[1]: @@ -351,7 +351,7 @@ def enable_caching(self): self._cache = defaultdict(lambda: deque(maxlen=3)) @classmethod - def from_numpy(cls, x, fill_value=None, storage_dtype=None): + def from_numpy(cls, x, fill_value=None, idx_dtype=None): """ Convert the given :obj:`numpy.ndarray` to a :obj:`COO` object. @@ -398,7 +398,7 @@ def from_numpy(cls, x, fill_value=None, storage_dtype=None): has_duplicates=False, sorted=True, fill_value=fill_value, - storage_dtype=storage_dtype, + idx_dtype=idx_dtype, ) def todense(self): @@ -1030,10 +1030,10 @@ def reshape(self, shape, order="C"): # TODO: this self.size enforces a 2**64 limit to array size linear_loc = self.linear_loc() - storage_dtype = self.coords.dtype - if shape != () and not can_store(storage_dtype, max(shape)): - storage_dtype = np.min_scalar_type(max(shape)) - coords = np.empty((len(shape), self.nnz), dtype=storage_dtype) + idx_dtype = self.coords.dtype + if shape != () and not can_store(idx_dtype, max(shape)): + idx_dtype = np.min_scalar_type(max(shape)) + coords = np.empty((len(shape), self.nnz), dtype=idx_dtype) strides = 1 for i, d in enumerate(shape[::-1]): coords[-(i + 1), :] = (linear_loc // strides) % d @@ -1083,10 +1083,10 @@ def resize(self, *args, refcheck=True, coords_dtype=np.intp): end_idx = np.searchsorted(linear_loc, new_size, side="left") linear_loc = linear_loc[:end_idx] - storage_dtype = self.coords.dtype - if shape != () and not can_store(storage_dtype, max(shape)): - storage_dtype = np.min_scalar_type(max(shape)) - coords = np.empty((len(shape), len(linear_loc)), dtype=storage_dtype) + idx_dtype = self.coords.dtype + if shape != () and not can_store(idx_dtype, max(shape)): + idx_dtype = np.min_scalar_type(max(shape)) + coords = np.empty((len(shape), len(linear_loc)), dtype=idx_dtype) strides = 1 for i, d in enumerate(shape[::-1]): coords[-(i + 1), :] = (linear_loc // strides) % d @@ -1451,7 +1451,7 @@ def asformat(self, format, compressed_axes=None): raise NotImplementedError("The given format is not supported.") -def as_coo(x, shape=None, fill_value=None, storage_dtype=None): +def as_coo(x, shape=None, fill_value=None, idx_dtype=None): """ Converts any given format to :obj:`COO`. See the "See Also" section for details. @@ -1490,7 +1490,7 @@ def as_coo(x, shape=None, fill_value=None, storage_dtype=None): return x.asformat("coo") if isinstance(x, np.ndarray): - return COO.from_numpy(x, fill_value=fill_value, storage_dtype=storage_dtype) + return COO.from_numpy(x, fill_value=fill_value, idx_dtype=idx_dtype) if isinstance(x, scipy.sparse.spmatrix): return COO.from_scipy_sparse(x) diff --git a/sparse/_utils.py b/sparse/_utils.py index 24c1f89d..ab7b92e9 100644 --- a/sparse/_utils.py +++ b/sparse/_utils.py @@ -85,7 +85,7 @@ def random( format="coo", compressed_axes=None, fill_value=None, - storage_dtype=None, + idx_dtype=None, ): """Generate a random sparse multidimensional array @@ -198,14 +198,12 @@ def random( fill_value=fill_value, ).reshape(shape) - if storage_dtype: - if can_store(storage_dtype, max(shape)): - ar.coords = ar.coords.astype(storage_dtype) + if idx_dtype: + if can_store(idx_dtype, max(shape)): + ar.coords = ar.coords.astype(idx_dtype) else: raise ValueError( - "cannot cast array with shape {} to dtype {}.".format( - shape, storage_dtype - ) + "cannot cast array with shape {} to dtype {}.".format(shape, idx_dtype) ) return ar.asformat(format, compressed_axes=compressed_axes) diff --git a/sparse/tests/test_compressed.py b/sparse/tests/test_compressed.py index 72a57053..c9b25290 100644 --- a/sparse/tests/test_compressed.py +++ b/sparse/tests/test_compressed.py @@ -435,20 +435,27 @@ def test_flatten(in_shape): def test_gcxs_valerr(): a = np.arange(300) with pytest.raises(ValueError): - GCXS.from_numpy(a, storage_dtype=np.int8) + GCXS.from_numpy(a, idx_dtype=np.int8) def test_upcast(): - a = sparse.random((50, 50, 50), density=0.1, format="coo", storage_dtype=np.uint8) + a = sparse.random((50, 50, 50), density=0.1, format="coo", idx_dtype=np.uint8) b = a.asformat("gcxs") assert b.indices.dtype == np.uint16 - a = sparse.random((8, 7, 6), density=0.5, format="gcxs", storage_dtype=np.uint8) + a = sparse.random((8, 7, 6), density=0.5, format="gcxs", idx_dtype=np.uint8) + b = sparse.random((6, 6, 6), density=0.8, format="gcxs", idx_dtype=np.uint8) assert sparse.concatenate((a, a)).indptr.dtype == np.uint16 - assert sparse.stack((a, a)).indptr.dtype == np.uint16 + assert sparse.stack((b, b)).indptr.dtype == np.uint16 def test_from_coo(): a = sparse.random((5, 5, 5), density=0.1, format="coo") b = GCXS(a) assert_eq(a, b) + + +def test_from_coo_valerr(): + a = sparse.random((25, 25, 25), density=0.01, format="coo") + with pytest.raises(ValueError): + GCXS.from_coo(a, idx_dtype=np.int8) diff --git a/sparse/tests/test_coo.py b/sparse/tests/test_coo.py index 342250d2..567cfb65 100644 --- a/sparse/tests/test_coo.py +++ b/sparse/tests/test_coo.py @@ -243,7 +243,7 @@ def test_resize(a, b): def test_resize_upcast(): - s = sparse.random((10, 10, 10), density=0.5, format="coo", storage_dtype=np.uint8) + s = sparse.random((10, 10, 10), density=0.5, format="coo", idx_dtype=np.uint8) s.resize(600) assert s.coords.dtype == np.uint16 @@ -378,7 +378,7 @@ def test_reshape_function(): def test_reshape_upcast(): - a = sparse.random((10, 10, 10), density=0.5, format="coo", storage_dtype=np.uint8) + a = sparse.random((10, 10, 10), density=0.5, format="coo", idx_dtype=np.uint8) assert a.reshape(1000).coords.dtype == np.uint16 @@ -1209,12 +1209,12 @@ def test_valerr(self, args): @pytest.mark.parametrize("dtype", [np.uint8, np.int8]) @pytest.mark.parametrize("shift", [300, -300]) def test_dtype_errors(self, dtype, shift): - x = sparse.random((5, 5, 5), density=0.2, storage_dtype=dtype) + x = sparse.random((5, 5, 5), density=0.2, idx_dtype=dtype) with pytest.raises(ValueError): sparse.roll(x, shift) def test_unsigned_type_error(self): - x = sparse.random((5, 5, 5), density=0.3, storage_dtype=np.uint8) + x = sparse.random((5, 5, 5), density=0.3, idx_dtype=np.uint8) with pytest.raises(ValueError): sparse.roll(x, -1) @@ -1600,4 +1600,9 @@ def test_astype_no_copy(): def test_coo_valerr(): a = np.arange(300) with pytest.raises(ValueError): - COO.from_numpy(a, storage_dtype=np.int8) + COO.from_numpy(a, idx_dtype=np.int8) + + +def test_random_idx_dtype(): + with pytest.raises(ValueError): + sparse.random((300,), density=0.1, format="coo", idx_dtype=np.int8)