allow for use of different dtypes in coords, indices, and indptr #441

Merged · 3 commits · Mar 16, 2021
Changes from 1 commit
8 changes: 7 additions & 1 deletion sparse/_compressed/common.py
@@ -1,5 +1,5 @@
import numpy as np
from .._utils import check_consistent_fill_value, normalize_axis
from .._utils import check_consistent_fill_value, normalize_axis, can_store


def concatenate(arrays, axis=0, compressed_axes=None):
@@ -41,6 +41,9 @@ def concatenate(arrays, axis=0, compressed_axes=None):
data = np.concatenate([arr.data for arr in arrays])
ptr_len = arrays[0].indptr.shape[0]
nnz = arrays[0].nnz
total_nnz = sum([arr.nnz for arr in arrays[:-1]])
if not can_store(indptr.dtype, total_nnz):
indptr = indptr.astype(np.min_scalar_type(total_nnz))
for i in range(1, len(arrays)):
indptr[ptr_len:] += nnz
nnz = arrays[i].nnz
@@ -93,6 +96,9 @@ def stack(arrays, axis=0, compressed_axes=None):
data = np.concatenate([arr.data for arr in arrays])
ptr_len = arrays[0].indptr.shape[0]
nnz = arrays[0].nnz
total_nnz = sum([arr.nnz for arr in arrays[:-1]])
if not can_store(indptr.dtype, total_nnz):
indptr = indptr.astype(np.min_scalar_type(total_nnz))
Collaborator: This branch needs hitting or removing.

for i in range(1, len(arrays)):
indptr[ptr_len:] += nnz
nnz = arrays[i].nnz
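Note: `can_store` comes from `sparse/_utils.py`, which is not part of this diff. A minimal sketch of the check these new branches rely on, assuming it simply tests whether a value fits in an integer dtype, might look like:

import numpy as np

def can_store(dtype, num):
    # Assumed behaviour: True when `num` fits in the integer `dtype`.
    return num <= np.iinfo(dtype).max

# Widen an index-pointer array only when the running total no longer fits.
indptr = np.array([0, 2, 5], dtype=np.uint8)
total_nnz = 300
if not can_store(indptr.dtype, total_nnz):
    indptr = indptr.astype(np.min_scalar_type(total_nnz))  # uint8 -> uint16

The real helper may differ in detail, but this upcast-on-demand pattern is what both `concatenate` and `stack` use above.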
55 changes: 41 additions & 14 deletions sparse/_compressed/compressed.py
@@ -11,6 +11,7 @@
from .._common import dot, matmul
from .._utils import (
normalize_axis,
can_store,
check_zero_fill_value,
check_compressed_axes,
equivalent,
@@ -20,7 +21,7 @@
from .indexing import getitem


def _from_coo(x, compressed_axes=None):
def _from_coo(x, compressed_axes=None, storage_dtype=None):

if x.ndim == 0:
if compressed_axes is not None:
@@ -49,16 +50,28 @@ def _from_coo(x, compressed_axes=None):
compressed_shape = (row_size, col_size)
shape = x.shape

if storage_dtype and not can_store(storage_dtype, max(compressed_shape)):
raise ValueError(
Collaborator: So does this one.

"cannot store array with the compressed shape of {} with dtype {}.".format(
compressed_shape, storage_dtype
)
)

if not storage_dtype:
storage_dtype = x.coords.dtype
if not can_store(storage_dtype, max(compressed_shape)):
storage_dtype = np.min_scalar_type(max(compressed_shape))

# transpose axes, linearize, reshape, and compress
linear = linear_loc(x.coords[axis_order], reordered_shape)
order = np.argsort(linear)
linear = linear[order]
coords = np.empty((2, x.nnz), dtype=np.intp)
coords = np.empty((2, x.nnz), dtype=storage_dtype)
strides = 1
for i, d in enumerate(compressed_shape[::-1]):
coords[-(i + 1), :] = (linear // strides) % d
strides *= d
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=storage_dtype)
indptr[0] = 0
np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:])
indices = coords[1]
@@ -112,7 +125,13 @@ class GCXS(SparseArray, NDArrayOperatorsMixin):
__array_priority__ = 12

def __init__(
self, arg, shape=None, compressed_axes=None, prune=False, fill_value=0
self,
arg,
shape=None,
compressed_axes=None,
prune=False,
fill_value=0,
storage_dtype=None,
):

if isinstance(arg, np.ndarray):
@@ -121,7 +140,9 @@ def __init__(
)

elif isinstance(arg, COO):
(arg, shape, compressed_axes, fill_value) = _from_coo(arg, compressed_axes)
(arg, shape, compressed_axes, fill_value) = _from_coo(
arg, compressed_axes, storage_dtype
)

if shape is None:
raise ValueError("missing `shape` argument")
@@ -157,13 +178,15 @@ def copy(self, deep=True):
return _copy.deepcopy(self) if deep else _copy.copy(self)

@classmethod
def from_numpy(cls, x, compressed_axes=None, fill_value=0):
coo = COO(x, fill_value=fill_value)
return cls.from_coo(coo, compressed_axes)
def from_numpy(cls, x, compressed_axes=None, fill_value=0, storage_dtype=None):
coo = COO(x, fill_value=fill_value, storage_dtype=storage_dtype)
return cls.from_coo(coo, compressed_axes, storage_dtype)

@classmethod
def from_coo(cls, x, compressed_axes=None):
(arg, shape, compressed_axes, fill_value) = _from_coo(x, compressed_axes)
def from_coo(cls, x, compressed_axes=None, storage_dtype=None):
(arg, shape, compressed_axes, fill_value) = _from_coo(
x, compressed_axes, storage_dtype
)
return cls(
arg, shape=shape, compressed_axes=compressed_axes, fill_value=fill_value
)
@@ -181,9 +204,13 @@ def from_scipy_sparse(cls, x):
)

@classmethod
def from_iter(cls, x, shape=None, compressed_axes=None, fill_value=None):
def from_iter(
cls, x, shape=None, compressed_axes=None, fill_value=None, storage_dtype=None
):
return cls.from_coo(
COO.from_iter(x, shape, fill_value), compressed_axes=compressed_axes
COO.from_iter(x, shape, fill_value),
compressed_axes,
storage_dtype,
)

@property
@@ -289,7 +316,7 @@ def _reduce_calc(self, method, axis, keepdims=False, **kwargs):
x = self.change_compressed_axes(compressed_axes)
idx = np.diff(x.indptr) != 0
indptr = x.indptr[:-1][idx]
indices = (np.arange(x._compressed_shape[0], dtype=np.intp))[idx]
indices = (np.arange(x._compressed_shape[0], dtype=self.indptr.dtype))[idx]
data = method.reduceat(x.data, indptr, **kwargs)
counts = x.indptr[1:][idx] - x.indptr[:-1][idx]
arr_attrs = (x, compressed_axes, indices)
@@ -782,7 +809,7 @@ def _prune(self):
coords = coords[:, mask]
self.indices = coords[1]
row_size = self._compressed_shape[0]
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=self.indptr.dtype)
indptr[0] = 0
np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:])
self.indptr = indptr
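A brief usage sketch of the new `storage_dtype` keyword threaded through `__init__`, `from_numpy`, `from_coo`, and `from_iter`. The keyword name follows this commit (the finally merged API may differ), and the printed dtypes assume the requested type can hold the compressed shape:

import numpy as np
import sparse

x = np.zeros((4, 4))
x[0, 1] = 1.0
x[2, 3] = 2.0

# Request 32-bit index storage instead of the default intp.
g = sparse.GCXS.from_numpy(x, compressed_axes=(0,), storage_dtype=np.uint32)
print(g.indptr.dtype, g.indices.dtype)  # expected: uint32 uint32

# A dtype too small for the compressed shape should raise the new ValueError:
# sparse.GCXS.from_numpy(np.zeros((300, 300)), storage_dtype=np.uint8)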
26 changes: 15 additions & 11 deletions sparse/_compressed/convert.py
@@ -1,25 +1,25 @@
import numpy as np
import numba
import operator
from .._utils import check_compressed_axes
from .._utils import check_compressed_axes, get_out_dtype
from .._coo.common import linear_loc
from functools import reduce
from numba.typed import List


def convert_to_flat(inds, shape):
def convert_to_flat(inds, shape, dtype):
"""
Converts the indices of either the compressed or uncompressed axes
into a linearized form. Prepares the inputs for compute_flat.
"""
inds = [np.array(ind) for ind in inds]
if any(ind.ndim > 1 for ind in inds):
raise IndexError("Only one-dimensional iterable indices supported.")
cols = np.empty(np.prod([ind.size for ind in inds]), dtype=np.intp)
cols = np.empty(np.prod([ind.size for ind in inds]), dtype=dtype)
shape_bins = transform_shape(np.asarray(shape))
increments = List()
for i in range(len(inds)):
increments.append((inds[i] * shape_bins[i]).astype(np.int32))
increments.append((inds[i] * shape_bins[i]).astype(dtype))
operations = np.prod([ind.shape[0] for ind in increments[:-1]])
return compute_flat(increments, cols, operations)

@@ -67,7 +67,7 @@ def transform_shape(shape): # pragma: no cover
@numba.jit(nopython=True, nogil=True)
def uncompress_dimension(indptr): # pragma: no cover
"""converts an index pointer array into an array of coordinates"""
uncompressed = np.empty(indptr[-1], dtype=np.intp)
uncompressed = np.empty(indptr[-1], dtype=indptr.dtype)
for i in range(len(indptr) - 1):
uncompressed[indptr[i] : indptr[i + 1]] = i
return uncompressed
@@ -123,7 +123,8 @@ def _1d_reshape(x, shape, compressed_axes):
x_indices = x.indices[:end_idx]
new_nnz = x_indices.size
new_linear = np.empty(new_nnz, dtype=np.intp)
new_coords = np.empty((2, new_nnz), dtype=np.intp)
coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape))
new_coords = np.empty((2, new_nnz), dtype=coords_dtype)

_linearize(
x_indices,
@@ -137,7 +138,7 @@

order = np.argsort(new_linear)
new_coords = new_coords[:, order]
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=coords_dtype)
indptr[0] = 0
np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:])
indices = new_coords[1]
@@ -162,7 +163,8 @@ def _resize(x, shape, compressed_axes):
coords = np.stack((uncompressed, x.indices))
linear = linear_loc(coords, x._compressed_shape)
sorted_axis_order = np.argsort(x._axis_order)
c_linear = np.empty(x.nnz, dtype=np.intp)
linear_dtype = get_out_dtype(x.indices, np.prod(shape))
c_linear = np.empty(x.nnz, dtype=linear_dtype)

_c_ordering(
linear,
@@ -203,7 +205,8 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
linear = linear_loc(coords, x._compressed_shape)
sorted_axis_order = np.argsort(x._axis_order)
if len(shape) == 1:
c_linear = np.empty(x.nnz, dtype=np.intp)
dtype = get_out_dtype(x.indices, shape[0])
c_linear = np.empty(x.nnz, dtype=dtype)
_c_ordering(
linear,
c_linear,
@@ -220,11 +223,12 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
new_axis_order.extend(np.setdiff1d(np.arange(len(shape)), compressed_axes))
new_linear = np.empty(x.nnz, dtype=np.intp)
new_reordered_shape = np.array(shape)[new_axis_order]
new_coords = np.empty((2, x.nnz), dtype=np.intp)
axisptr = len(compressed_axes)
row_size = np.prod(new_reordered_shape[:axisptr])
col_size = np.prod(new_reordered_shape[axisptr:])
new_compressed_shape = np.array((row_size, col_size))
coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape))
new_coords = np.empty((2, x.nnz), dtype=coords_dtype)

_convert_coords(
linear,
@@ -247,7 +251,7 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
indptr = []
indices = coords[0, :]
else:
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=coords_dtype)
indptr[0] = 0
np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:])
indices = new_coords[1]
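`get_out_dtype` is imported from `sparse/_utils.py` and not shown in this diff. Judging from the call sites above, it presumably keeps the existing index dtype when the new maximum value still fits and widens it otherwise; a hedged sketch:

import numpy as np

def get_out_dtype(arr, maxval):
    # Assumed behaviour: keep arr.dtype if it can hold maxval, else widen.
    dtype = arr.dtype
    if maxval > np.iinfo(dtype).max:
        dtype = np.min_scalar_type(int(maxval))
    return dtype

indices = np.array([0, 3, 7], dtype=np.uint16)
print(get_out_dtype(indices, 60_000))  # uint16 still fits
print(get_out_dtype(indices, 70_000))  # widened, e.g. uint32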
18 changes: 11 additions & 7 deletions sparse/_compressed/indexing.py
@@ -89,13 +89,17 @@ def getitem(x, key):
# convert all indices of compressed axes to a single array index
# this tells us which 'rows' of the underlying csr matrix to iterate through
rows = convert_to_flat(
reordered_key[: x._axisptr], x._reordered_shape[: x._axisptr]
reordered_key[: x._axisptr],
x._reordered_shape[: x._axisptr],
x.indices.dtype,
)

# convert all indices of uncompressed axes to a single array index
# this tells us which 'columns' of the underlying csr matrix to iterate through
cols = convert_to_flat(
reordered_key[x._axisptr :], x._reordered_shape[x._axisptr :]
reordered_key[x._axisptr :],
x._reordered_shape[x._axisptr :],
x.indices.dtype,
)

starts = x.indptr[:-1][rows] # find the start and end of each of the rows
@@ -117,7 +121,7 @@ def getitem(x, key):
compressed_axes = (0,) # defaults to 0
row_size = starts.size

indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=x.indptr.dtype)
indptr[0] = 0
if pos_slice:
arg = get_slicing_selection(x.data, x.indices, indptr, starts, ends, cols)
@@ -134,7 +138,7 @@ def getitem(x, key):
indptr = None
else:
indices = uncompressed % size
indptr = np.empty(shape[0] + 1, dtype=np.intp)
indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype)
indptr[0] = 0
np.cumsum(
np.bincount(uncompressed // size, minlength=shape[0]), out=indptr[1:]
Expand All @@ -144,7 +148,7 @@ def getitem(x, key):
indptr = None
else:
uncompressed = indices // size
indptr = np.empty(shape[0] + 1, dtype=np.intp)
indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype)
indptr[0] = 0
np.cumsum(np.bincount(uncompressed, minlength=shape[0]), out=indptr[1:])
indices = indices % size
@@ -230,7 +234,7 @@ def get_slicing_selection(
ind_list.extend(inds)
indptr[i + 1] = indptr[i] + len(inds)
ind_list = np.array(ind_list, dtype=np.int64)
indices = np.array(indices)
indices = np.array(indices, dtype=indptr.dtype)
data = arr_data[ind_list]
return (data, indices, indptr)

@@ -261,7 +265,7 @@ def get_array_selection(
ind_list.extend(inds)
indptr[i + 1] = indptr[i] + len(inds)
ind_list = np.array(ind_list, dtype=np.int64)
indices = np.array(indices)
indices = np.array(indices, dtype=indptr.dtype)
data = arr_data[ind_list]
return (data, indices, indptr)

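With the selection helpers now taking their dtype from `x.indptr.dtype`, indexing should preserve a small index dtype rather than promoting everything to `intp`. A rough illustration (hypothetical values; keyword name per this commit):

import numpy as np
import sparse

g = sparse.GCXS.from_numpy(np.eye(10), storage_dtype=np.uint32)
sub = g[2:7]
print(sub.indptr.dtype, sub.indices.dtype)  # expected to remain uint32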
27 changes: 24 additions & 3 deletions sparse/_coo/common.py
@@ -10,9 +10,11 @@
from .._sparse_array import SparseArray
from .._utils import (
isscalar,
is_unsigned_dtype,
normalize_axis,
check_zero_fill_value,
check_consistent_fill_value,
can_store,
)


@@ -173,6 +175,8 @@ def concatenate(arrays, axis=0):
data = np.concatenate([x.data for x in arrays])
coords = np.concatenate([x.coords for x in arrays], axis=1)

if not can_store(coords.dtype, max(shape)):
coords = coords.astype(np.min_scalar_type(max(shape)))
dim = 0
for x in arrays:
if dim:
@@ -688,6 +692,7 @@ def roll(a, shift, axis=None):
Output array, with the same shape as a.
"""
from .core import COO, as_coo
from numpy.core._exceptions import UFuncTypeError

a = as_coo(a)

@@ -719,11 +724,27 @@ def roll(a, shift, axis=None):
"If 'shift' is a 1D sequence, " "'axis' must have equal length."
)

if not can_store(a.coords.dtype, max(a.shape + shift)):
raise ValueError(
"cannot roll with coords.dtype {} and shift {}. Try casting coords to a larger dtype.".format(
a.coords.dtype,
shift,
)
)

# shift elements
coords, data = np.copy(a.coords), np.copy(a.data)
for sh, ax in zip(shift, axis):
coords[ax] += sh
coords[ax] %= a.shape[ax]
try:
for sh, ax in zip(shift, axis):
coords[ax] += sh
coords[ax] %= a.shape[ax]
except UFuncTypeError:
if is_unsigned_dtype(coords.dtype):
raise ValueError(
"rolling with coords.dtype as {} is not safe. Try using a signed dtype.".format(
coords.dtype
)
)

return COO(
coords,
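The extra checks matter because `coords` may now be stored in a small or unsigned dtype: adding a shift can overflow, and adding a negative shift to an unsigned array raises numpy's `UFuncTypeError` (on the NumPy versions targeted here), which `roll` now converts into a more helpful `ValueError`. An illustrative sketch of the overflow the `can_store` guard prevents, using made-up values:

import numpy as np

coords = np.array([0, 100, 200], dtype=np.uint8)  # axis coordinates
shift, size = 100, 250                            # roll by 100 along an axis of length 250

# 200 + 100 does not fit in uint8, so the shift must happen in a wider type:
wide = coords.astype(np.min_scalar_type(size + shift))
wide += shift
wide %= size
print(wide)  # [100 200  50]

# Doing the same in place on the uint8 array would silently wrap modulo 256
# and produce 44 instead of 50 for the last coordinate.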