allow for use of different dtypes in coords, indices, and indptr #441

Merged · 3 commits · Mar 16, 2021
Changes from 1 commit
8 changes: 7 additions & 1 deletion sparse/_compressed/common.py
@@ -1,5 +1,5 @@
import numpy as np
from .._utils import check_consistent_fill_value, normalize_axis
from .._utils import check_consistent_fill_value, normalize_axis, can_store


def concatenate(arrays, axis=0, compressed_axes=None):
@@ -41,6 +41,9 @@ def concatenate(arrays, axis=0, compressed_axes=None):
data = np.concatenate([arr.data for arr in arrays])
ptr_len = arrays[0].indptr.shape[0]
nnz = arrays[0].nnz
total_nnz = sum([arr.nnz for arr in arrays[:-1]])
if not can_store(indptr.dtype, total_nnz):
indptr = indptr.astype(np.min_scalar_type(total_nnz))
for i in range(1, len(arrays)):
indptr[ptr_len:] += nnz
nnz = arrays[i].nnz
@@ -93,6 +96,9 @@ def stack(arrays, axis=0, compressed_axes=None):
data = np.concatenate([arr.data for arr in arrays])
ptr_len = arrays[0].indptr.shape[0]
nnz = arrays[0].nnz
total_nnz = sum([arr.nnz for arr in arrays[:-1]])
if not can_store(indptr.dtype, total_nnz):
indptr = indptr.astype(np.min_scalar_type(total_nnz))
Collaborator: This branch needs hitting or removing.

for i in range(1, len(arrays)):
indptr[ptr_len:] += nnz
nnz = arrays[i].nnz
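Note: `can_store` comes from `sparse/_utils.py`, which is not part of this diff. A minimal sketch of the check these new branches rely on, assuming it simply tests whether a value fits in an integer dtype, might look like:

import numpy as np

def can_store(dtype, num):
    # Assumed behaviour: True when `num` fits in the integer `dtype`.
    return num <= np.iinfo(dtype).max

# Widen an index-pointer array only when the running total no longer fits.
indptr = np.array([0, 2, 5], dtype=np.uint8)
total_nnz = 300
if not can_store(indptr.dtype, total_nnz):
    indptr = indptr.astype(np.min_scalar_type(total_nnz))  # uint8 -> uint16

The real helper may differ in detail, but this upcast-on-demand pattern is what both `concatenate` and `stack` use above.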
55 changes: 41 additions & 14 deletions sparse/_compressed/compressed.py
@@ -11,6 +11,7 @@
from .._common import dot, matmul
from .._utils import (
normalize_axis,
can_store,
check_zero_fill_value,
check_compressed_axes,
equivalent,
@@ -20,7 +21,7 @@
from .indexing import getitem


def _from_coo(x, compressed_axes=None):
def _from_coo(x, compressed_axes=None, storage_dtype=None):

if x.ndim == 0:
if compressed_axes is not None:
@@ -49,16 +50,28 @@ def _from_coo(x, compressed_axes=None):
compressed_shape = (row_size, col_size)
shape = x.shape

if storage_dtype and not can_store(storage_dtype, max(compressed_shape)):
raise ValueError(
Collaborator: So does this one.

"cannot store array with the compressed shape of {} with dtype {}.".format(
compressed_shape, storage_dtype
)
)

if not storage_dtype:
storage_dtype = x.coords.dtype
if not can_store(storage_dtype, max(compressed_shape)):
storage_dtype = np.min_scalar_type(max(compressed_shape))

# transpose axes, linearize, reshape, and compress
linear = linear_loc(x.coords[axis_order], reordered_shape)
order = np.argsort(linear)
linear = linear[order]
coords = np.empty((2, x.nnz), dtype=np.intp)
coords = np.empty((2, x.nnz), dtype=storage_dtype)
strides = 1
for i, d in enumerate(compressed_shape[::-1]):
coords[-(i + 1), :] = (linear // strides) % d
strides *= d
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=storage_dtype)
indptr[0] = 0
np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:])
indices = coords[1]
@@ -112,7 +125,13 @@ class GCXS(SparseArray, NDArrayOperatorsMixin):
__array_priority__ = 12

def __init__(
self, arg, shape=None, compressed_axes=None, prune=False, fill_value=0
self,
arg,
shape=None,
compressed_axes=None,
prune=False,
fill_value=0,
storage_dtype=None,
):

if isinstance(arg, np.ndarray):
@@ -121,7 +140,9 @@ def __init__(
)

elif isinstance(arg, COO):
(arg, shape, compressed_axes, fill_value) = _from_coo(arg, compressed_axes)
(arg, shape, compressed_axes, fill_value) = _from_coo(
arg, compressed_axes, storage_dtype
)

if shape is None:
raise ValueError("missing `shape` argument")
@@ -157,13 +178,15 @@ def copy(self, deep=True):
return _copy.deepcopy(self) if deep else _copy.copy(self)

@classmethod
def from_numpy(cls, x, compressed_axes=None, fill_value=0):
coo = COO(x, fill_value=fill_value)
return cls.from_coo(coo, compressed_axes)
def from_numpy(cls, x, compressed_axes=None, fill_value=0, storage_dtype=None):
coo = COO(x, fill_value=fill_value, storage_dtype=storage_dtype)
return cls.from_coo(coo, compressed_axes, storage_dtype)

@classmethod
def from_coo(cls, x, compressed_axes=None):
(arg, shape, compressed_axes, fill_value) = _from_coo(x, compressed_axes)
def from_coo(cls, x, compressed_axes=None, storage_dtype=None):
(arg, shape, compressed_axes, fill_value) = _from_coo(
x, compressed_axes, storage_dtype
)
return cls(
arg, shape=shape, compressed_axes=compressed_axes, fill_value=fill_value
)
@@ -181,9 +204,13 @@ def from_scipy_sparse(cls, x):
)

@classmethod
def from_iter(cls, x, shape=None, compressed_axes=None, fill_value=None):
def from_iter(
cls, x, shape=None, compressed_axes=None, fill_value=None, storage_dtype=None
):
return cls.from_coo(
COO.from_iter(x, shape, fill_value), compressed_axes=compressed_axes
COO.from_iter(x, shape, fill_value),
compressed_axes,
storage_dtype,
)

@property
@@ -289,7 +316,7 @@ def _reduce_calc(self, method, axis, keepdims=False, **kwargs):
x = self.change_compressed_axes(compressed_axes)
idx = np.diff(x.indptr) != 0
indptr = x.indptr[:-1][idx]
indices = (np.arange(x._compressed_shape[0], dtype=np.intp))[idx]
indices = (np.arange(x._compressed_shape[0], dtype=self.indptr.dtype))[idx]
data = method.reduceat(x.data, indptr, **kwargs)
counts = x.indptr[1:][idx] - x.indptr[:-1][idx]
arr_attrs = (x, compressed_axes, indices)
@@ -782,7 +809,7 @@ def _prune(self):
coords = coords[:, mask]
self.indices = coords[1]
row_size = self._compressed_shape[0]
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=self.indptr.dtype)
indptr[0] = 0
np.cumsum(np.bincount(coords[0], minlength=row_size), out=indptr[1:])
self.indptr = indptr
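A brief usage sketch of the new `storage_dtype` keyword threaded through `__init__`, `from_numpy`, `from_coo`, and `from_iter`. The keyword name follows this commit (the finally merged API may differ), and the printed dtypes assume the requested type can hold the compressed shape:

import numpy as np
import sparse

x = np.zeros((4, 4))
x[0, 1] = 1.0
x[2, 3] = 2.0

# Request 32-bit index storage instead of the default intp.
g = sparse.GCXS.from_numpy(x, compressed_axes=(0,), storage_dtype=np.uint32)
print(g.indptr.dtype, g.indices.dtype)  # expected: uint32 uint32

# A dtype too small for the compressed shape should raise the new ValueError:
# sparse.GCXS.from_numpy(np.zeros((300, 300)), storage_dtype=np.uint8)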
26 changes: 15 additions & 11 deletions sparse/_compressed/convert.py
@@ -1,25 +1,25 @@
import numpy as np
import numba
import operator
from .._utils import check_compressed_axes
from .._utils import check_compressed_axes, get_out_dtype
from .._coo.common import linear_loc
from functools import reduce
from numba.typed import List


def convert_to_flat(inds, shape):
def convert_to_flat(inds, shape, dtype):
"""
Converts the indices of either the compressed or uncompressed axes
into a linearized form. Prepares the inputs for compute_flat.
"""
inds = [np.array(ind) for ind in inds]
if any(ind.ndim > 1 for ind in inds):
raise IndexError("Only one-dimensional iterable indices supported.")
cols = np.empty(np.prod([ind.size for ind in inds]), dtype=np.intp)
cols = np.empty(np.prod([ind.size for ind in inds]), dtype=dtype)
shape_bins = transform_shape(np.asarray(shape))
increments = List()
for i in range(len(inds)):
increments.append((inds[i] * shape_bins[i]).astype(np.int32))
increments.append((inds[i] * shape_bins[i]).astype(dtype))
operations = np.prod([ind.shape[0] for ind in increments[:-1]])
return compute_flat(increments, cols, operations)

@@ -67,7 +67,7 @@ def transform_shape(shape): # pragma: no cover
@numba.jit(nopython=True, nogil=True)
def uncompress_dimension(indptr): # pragma: no cover
"""converts an index pointer array into an array of coordinates"""
uncompressed = np.empty(indptr[-1], dtype=np.intp)
uncompressed = np.empty(indptr[-1], dtype=indptr.dtype)
for i in range(len(indptr) - 1):
uncompressed[indptr[i] : indptr[i + 1]] = i
return uncompressed
@@ -123,7 +123,8 @@ def _1d_reshape(x, shape, compressed_axes):
x_indices = x.indices[:end_idx]
new_nnz = x_indices.size
new_linear = np.empty(new_nnz, dtype=np.intp)
new_coords = np.empty((2, new_nnz), dtype=np.intp)
coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape))
new_coords = np.empty((2, new_nnz), dtype=coords_dtype)

_linearize(
x_indices,
@@ -137,7 +138,7 @@

order = np.argsort(new_linear)
new_coords = new_coords[:, order]
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=coords_dtype)
indptr[0] = 0
np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:])
indices = new_coords[1]
@@ -162,7 +163,8 @@ def _resize(x, shape, compressed_axes):
coords = np.stack((uncompressed, x.indices))
linear = linear_loc(coords, x._compressed_shape)
sorted_axis_order = np.argsort(x._axis_order)
c_linear = np.empty(x.nnz, dtype=np.intp)
linear_dtype = get_out_dtype(x.indices, np.prod(shape))
c_linear = np.empty(x.nnz, dtype=linear_dtype)

_c_ordering(
linear,
@@ -203,7 +205,8 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
linear = linear_loc(coords, x._compressed_shape)
sorted_axis_order = np.argsort(x._axis_order)
if len(shape) == 1:
c_linear = np.empty(x.nnz, dtype=np.intp)
dtype = get_out_dtype(x.indices, shape[0])
c_linear = np.empty(x.nnz, dtype=dtype)
_c_ordering(
linear,
c_linear,
@@ -220,11 +223,12 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
new_axis_order.extend(np.setdiff1d(np.arange(len(shape)), compressed_axes))
new_linear = np.empty(x.nnz, dtype=np.intp)
new_reordered_shape = np.array(shape)[new_axis_order]
new_coords = np.empty((2, x.nnz), dtype=np.intp)
axisptr = len(compressed_axes)
row_size = np.prod(new_reordered_shape[:axisptr])
col_size = np.prod(new_reordered_shape[axisptr:])
new_compressed_shape = np.array((row_size, col_size))
coords_dtype = get_out_dtype(x.indices, max(new_compressed_shape))
new_coords = np.empty((2, x.nnz), dtype=coords_dtype)

_convert_coords(
linear,
@@ -247,7 +251,7 @@ def _transpose(x, shape, axes, compressed_axes, transpose=False):
indptr = []
indices = coords[0, :]
else:
indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=coords_dtype)
indptr[0] = 0
np.cumsum(np.bincount(new_coords[0], minlength=row_size), out=indptr[1:])
indices = new_coords[1]
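`get_out_dtype` is imported from `sparse/_utils.py` and not shown in this diff. Judging from the call sites above, it presumably keeps the existing index dtype when the new maximum value still fits and widens it otherwise; a hedged sketch:

import numpy as np

def get_out_dtype(arr, maxval):
    # Assumed behaviour: keep arr.dtype if it can hold maxval, else widen.
    dtype = arr.dtype
    if maxval > np.iinfo(dtype).max:
        dtype = np.min_scalar_type(int(maxval))
    return dtype

indices = np.array([0, 3, 7], dtype=np.uint16)
print(get_out_dtype(indices, 60_000))  # uint16 still fits
print(get_out_dtype(indices, 70_000))  # widened, e.g. uint32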
18 changes: 11 additions & 7 deletions sparse/_compressed/indexing.py
@@ -89,13 +89,17 @@ def getitem(x, key):
# convert all indices of compressed axes to a single array index
# this tells us which 'rows' of the underlying csr matrix to iterate through
rows = convert_to_flat(
reordered_key[: x._axisptr], x._reordered_shape[: x._axisptr]
reordered_key[: x._axisptr],
x._reordered_shape[: x._axisptr],
x.indices.dtype,
)

# convert all indices of uncompressed axes to a single array index
# this tells us which 'columns' of the underlying csr matrix to iterate through
cols = convert_to_flat(
reordered_key[x._axisptr :], x._reordered_shape[x._axisptr :]
reordered_key[x._axisptr :],
x._reordered_shape[x._axisptr :],
x.indices.dtype,
)

starts = x.indptr[:-1][rows] # find the start and end of each of the rows
@@ -117,7 +121,7 @@ def getitem(x, key):
compressed_axes = (0,) # defaults to 0
row_size = starts.size

indptr = np.empty(row_size + 1, dtype=np.intp)
indptr = np.empty(row_size + 1, dtype=x.indptr.dtype)
indptr[0] = 0
if pos_slice:
arg = get_slicing_selection(x.data, x.indices, indptr, starts, ends, cols)
@@ -134,7 +138,7 @@ def getitem(x, key):
indptr = None
else:
indices = uncompressed % size
indptr = np.empty(shape[0] + 1, dtype=np.intp)
indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype)
indptr[0] = 0
np.cumsum(
np.bincount(uncompressed // size, minlength=shape[0]), out=indptr[1:]
Expand All @@ -144,7 +148,7 @@ def getitem(x, key):
indptr = None
else:
uncompressed = indices // size
indptr = np.empty(shape[0] + 1, dtype=np.intp)
indptr = np.empty(shape[0] + 1, dtype=x.indptr.dtype)
indptr[0] = 0
np.cumsum(np.bincount(uncompressed, minlength=shape[0]), out=indptr[1:])
indices = indices % size
@@ -230,7 +234,7 @@ def get_slicing_selection(
ind_list.extend(inds)
indptr[i + 1] = indptr[i] + len(inds)
ind_list = np.array(ind_list, dtype=np.int64)
indices = np.array(indices)
indices = np.array(indices, dtype=indptr.dtype)
data = arr_data[ind_list]
return (data, indices, indptr)

@@ -261,7 +265,7 @@ def get_array_selection(
ind_list.extend(inds)
indptr[i + 1] = indptr[i] + len(inds)
ind_list = np.array(ind_list, dtype=np.int64)
indices = np.array(indices)
indices = np.array(indices, dtype=indptr.dtype)
data = arr_data[ind_list]
return (data, indices, indptr)

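With the selection helpers now taking their dtype from `x.indptr.dtype`, indexing should preserve a small index dtype rather than promoting everything to `intp`. A rough illustration (hypothetical values; keyword name per this commit):

import numpy as np
import sparse

g = sparse.GCXS.from_numpy(np.eye(10), storage_dtype=np.uint32)
sub = g[2:7]
print(sub.indptr.dtype, sub.indices.dtype)  # expected to remain uint32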
27 changes: 24 additions & 3 deletions sparse/_coo/common.py
@@ -10,9 +10,11 @@
from .._sparse_array import SparseArray
from .._utils import (
isscalar,
is_unsigned_dtype,
normalize_axis,
check_zero_fill_value,
check_consistent_fill_value,
can_store,
)


@@ -173,6 +175,8 @@ def concatenate(arrays, axis=0):
data = np.concatenate([x.data for x in arrays])
coords = np.concatenate([x.coords for x in arrays], axis=1)

if not can_store(coords.dtype, max(shape)):
coords = coords.astype(np.min_scalar_type(max(shape)))
dim = 0
for x in arrays:
if dim:
@@ -688,6 +692,7 @@ def roll(a, shift, axis=None):
Output array, with the same shape as a.
"""
from .core import COO, as_coo
from numpy.core._exceptions import UFuncTypeError

a = as_coo(a)

@@ -719,11 +724,27 @@ def roll(a, shift, axis=None):
"If 'shift' is a 1D sequence, " "'axis' must have equal length."
)

if not can_store(a.coords.dtype, max(a.shape + shift)):
raise ValueError(
"cannot roll with coords.dtype {} and shift {}. Try casting coords to a larger dtype.".format(
a.coords.dtype,
shift,
)
)

# shift elements
coords, data = np.copy(a.coords), np.copy(a.data)
for sh, ax in zip(shift, axis):
coords[ax] += sh
coords[ax] %= a.shape[ax]
try:
for sh, ax in zip(shift, axis):
coords[ax] += sh
coords[ax] %= a.shape[ax]
except UFuncTypeError:
if is_unsigned_dtype(coords.dtype):
raise ValueError(
"rolling with coords.dtype as {} is not safe. Try using a signed dtype.".format(
coords.dtype
)
)

return COO(
coords,
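The extra checks matter because `coords` may now be stored in a small or unsigned dtype: adding a shift can overflow, and adding a negative shift to an unsigned array raises numpy's `UFuncTypeError` (on the NumPy versions targeted here), which `roll` now converts into a more helpful `ValueError`. An illustrative sketch of the overflow the `can_store` guard prevents, using made-up values:

import numpy as np

coords = np.array([0, 100, 200], dtype=np.uint8)  # axis coordinates
shift, size = 100, 250                            # roll by 100 along an axis of length 250

# 200 + 100 does not fit in uint8, so the shift must happen in a wider type:
wide = coords.astype(np.min_scalar_type(size + shift))
wide += shift
wide %= size
print(wide)  # [100 200  50]

# Doing the same in place on the uint8 array would silently wrap modulo 256
# and produce 44 instead of 50 for the last coordinate.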