Skip to content

Commit

Permalink
Merge pull request #23 from brightway-lca/sparse_matrix_mapping
Browse files Browse the repository at this point in the history
Sparse matrix mapping. Fixes #22
  • Loading branch information
cmutel authored Nov 1, 2024
2 parents 087757b + 22f7d27 commit d1ee2a7
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 46 deletions.
150 changes: 129 additions & 21 deletions dev/Mapping speed tests.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"source": [
"import itertools\n",
"import numpy as np\n",
"from scipy import sparse"
"from scipy import sparse\n",
"import matrix_utils as mu"
]
},
{
Expand Down Expand Up @@ -60,20 +61,120 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 14,
"id": "wired-lesbian",
"metadata": {},
"outputs": [],
"source": [
"def index_with_indexarray(array_from):\n",
" # Twice as fast as index_with_searchsorted\n",
" unique = np.unique(array_from)\n",
" values = np.arange(unique.max() + 1)\n",
" array = np.unique(array_from)\n",
" \n",
" if array_from.shape == (0,):\n",
" if empty_ok:\n",
" max_value = 0\n",
" else:\n",
" raise EmptyArray(\"Empty array can't be used to map values\")\n",
" else:\n",
" max_value = int(array.max())\n",
" index_array = np.zeros(max_value + 1) - 1\n",
" index_array[array] = np.arange(len(array))\n",
"\n",
" index_array = np.zeros_like(unique) - 1\n",
" index_array[unique] = values\n",
" result = np.zeros_like(array_from) - 1\n",
" mask = array_from <= max_value\n",
" result[mask] = index_array[array_from[mask]]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "b24eddaa-76e1-4bcc-8db9-4e648f564f13",
"metadata": {},
"outputs": [],
"source": [
"def index_with_sparse_matrix(array_from):\n",
" rows = np.unique(array_from)\n",
" cols = np.zeros_like(rows)\n",
" values = np.arange(1, len(rows) + 1)\n",
" matrix = sparse.coo_matrix((values, (rows, cols)), (rows[-1] + 1, 1)).tocsr()\n",
"\n",
" return index_array[array_from]"
" return (matrix[array_from, 0]).tocoo().data"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "b6a818cc-00fb-457f-b91d-14044c028efd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 3, 1, 2, 0])"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"array_from = np.array([4, 19000, 4, 8, 1])\n",
"index_with_sparse_matrix(array_from)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d5486625-8bab-4500-8167-66f5a8f851f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3488834974658x1 sparse array of type '<class 'numpy.int64'>'\n",
"\twith 4 stored elements in COOrdinate format>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inpt = np.array([1288834974657, 2288834974657, 3488834974657, 3288834974657])\n",
"array = np.unique(inpt)\n",
"max_value = array[-1]\n",
"max_index = len(array) - 1\n",
"matrix = sparse.coo_array(\n",
" (np.arange(1, len(array) + 1), (array, np.zeros_like(array))),\n",
" (max_value + 1, 1),\n",
") #.todok()\n",
"matrix"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "562a1c34-32b8-45fc-bf31-519d9b0a83f0",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'coo_array' object is not subscriptable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m result \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros_like(inpt) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 3\u001b[0m mask \u001b[38;5;241m=\u001b[39m inpt \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m max_value\n\u001b[0;32m----> 4\u001b[0m (\u001b[43mmatrix\u001b[49m\u001b[43m[\u001b[49m\u001b[43minpt\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmask\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mzeros_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43minpt\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmask\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m)\n",
"\u001b[0;31mTypeError\u001b[0m: 'coo_array' object is not subscriptable"
]
}
],
"source": [
"inpt = np.array([1288834974657, 2288834974657, 348883974657])\n",
"result = np.zeros_like(inpt) - 1\n",
"mask = inpt <= max_value\n",
"(matrix[inpt[mask], np.zeros_like(inpt[mask])])"
]
},
{
Expand Down Expand Up @@ -113,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "little-marshall",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -163,36 +264,43 @@
},
{
"cell_type": "code",
"execution_count": 8,
"id": "parliamentary-period",
"execution_count": 15,
"id": "84f97b1b-eb9a-498b-bf24-979be3212fd4",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12.5 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"_, mapping = index_with_searchsorted(arr)\n",
"arr2 = input_array()"
"%timeit index_with_indexarray(arr)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "laughing-selection",
"execution_count": 44,
"id": "7491e454-ce31-4132-90c9-0d37f79e92c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5.29 ms ± 498 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"121 µs ± 352 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%timeit index_with_arrays(arr2, np.zeros_like(arr2), mapping)"
"%timeit index_with_sparse_matrix(array_from)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "systematic-shopping",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -290,7 +398,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -304,7 +412,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
"version": "3.10.14"
}
},
"nbformat": 4,
Expand Down
41 changes: 27 additions & 14 deletions matrix_utils/array_mapper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
from scipy import sparse

from .errors import EmptyArray

Expand All @@ -23,28 +24,31 @@ class ArrayMapper:
""" # NOQA: E501

def __init__(self, *, array: np.ndarray, sparse_cutoff: float = 0.1, empty_ok: bool = False):
def __init__(self, *, array: np.ndarray, sparse_cutoff: int = 50000, empty_ok: bool = False):
self._check_input_array(array)

# Even if already unique, this only adds ~2ms for 100.000 elements
self.array = np.unique(array)
self.empty_ok = empty_ok

# TODO
# Sparse matrices could be used if the number of values present is much less
# than the number of possible values, given the (min, max) range.
# The default code will generate a complete mapping for the (min, max)
# interval, which can use too much memory in certain cases.
# self.use_sparse = len(self.keys) / self.keys.max() <= sparse_cutoff:

if array.shape == (0,):
if empty_ok:
if self.array.shape == (0,):
if self.empty_ok:
self.empty_input = True
self.max_value = 0
self.max_index = 0
return
else:
raise EmptyArray("Empty array can't be used to map values")
else:
self.max_value = int(self.array.max())
self.index_array = np.zeros(self.max_value + 1) - 1
self.index_array[self.array] = np.arange(len(self.array))
self.empty_input = False
self.max_value = self.array[-1]
self.max_index = len(self.array) - 1

# Zero serves as a missing value, so start at one
self.matrix = sparse.coo_matrix(
(np.arange(1, len(self.array) + 1), (self.array, np.zeros_like(self.array))),
(self.max_value + 1, 1),
).tocsc()

def __len__(self):
return self.array.shape[0]
Expand All @@ -61,16 +65,25 @@ def map_array(self, array: np.ndarray) -> np.ndarray:
if array.shape == (0,):
# Empty array
return array.copy()
elif self.empty_input:
if self.empty_ok:
# Return all with missing flag
return np.zeros_like(array) - 1
else:
raise EmptyArray("Can't map with empty input array")

result = np.zeros_like(array) - 1
mask = array <= self.max_value
result[mask] = self.index_array[array[mask]]
# https://numpy.org/doc/stable/reference/generated/numpy.matrix.A1.html
result[mask] = (self.matrix[array[mask], np.zeros_like(array[mask])]).A1 - 1
return result

def to_dict(self) -> dict:
    """Return the mapping as a plain Python dict of input value -> mapped index.

    This exists for human examination only; the normal implementation
    works with Numpy functions on the arrays directly. A mapper flagged
    as ``empty_input`` yields an empty dict.
    """
    if self.empty_input:
        return {}
    mapped = self.map_array(self.array)
    return {int(key): int(index) for key, index in zip(self.array, mapped)}

def reverse_dict(self) -> dict:
Expand Down
6 changes: 3 additions & 3 deletions matrix_utils/mapped_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,12 @@ def __init__(
col_indices = safe_concatenate_indices([obj.col_matrix for obj in self.groups], empty_ok)

if diagonal:
x = int(self.row_mapper.index_array.max() + 1)
x = 0 if self.row_mapper.empty_input else self.row_mapper.max_index + 1
dimensions = (x, x)
else:
dimensions = (
int(self.row_mapper.index_array.max() + 1),
int(self.col_mapper.index_array.max() + 1),
0 if self.row_mapper.empty_input else self.row_mapper.max_index + 1,
0 if self.col_mapper.empty_input else self.col_mapper.max_index + 1,
)

self.matrix = sparse.coo_matrix(
Expand Down
12 changes: 10 additions & 2 deletions tests/array_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,17 @@ def test_initial_setup():
assert np.allclose(am.map_array(inpt), expected)


def test_with_large_values():
    """Map keys in the trillions, including one the mapper has never seen."""
    known = np.array([1288834974657, 2288834974657, 3488834974657, 3288834974657])
    mapper = ArrayMapper(array=known)
    # The second query value is not among the known keys, so -1 is expected
    # for it; the remaining values are expected at positions 0, 2 and 3.
    queried = np.array([1288834974657, 228883474657, 3288834974657, 3488834974657])
    mapped = mapper.map_array(queried)
    assert np.allclose(mapped, np.array([0, -1, 2, 3]))


def test_float_indices_raises_error():
inpt = np.array([1, 2, 3, 6.0])
with pytest.raises(IndexError):
with pytest.raises(TypeError):
ArrayMapper(array=inpt)


Expand Down Expand Up @@ -69,4 +77,4 @@ def test_empty_array_error():
def test_empty_array_ok():
am = ArrayMapper(array=np.array([], dtype=int), empty_ok=True)
assert am.array.shape == (0,)
assert am.index_array.shape == (1,)
assert am.empty_input
2 changes: 0 additions & 2 deletions tests/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,6 @@ def test_matrix_construction_overlapping_sum(smaller):
col = np.array([0, 1, 2, 3, 5, 4])
data = np.array([12, 16.3, 4, 25, 12.3, 125])
matrix = mm.matrix.tocoo()
print(matrix)
print(matrix.data)
assert np.allclose(matrix.row, row)
assert np.allclose(matrix.col, col)
assert np.allclose(matrix.data, data)
Expand Down
1 change: 0 additions & 1 deletion tests/mapped_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,6 @@ def test_input_index_vector(sensitivity_dps):
(942484272, 942484272, 1, 1),
]
for row in expected:
print(mm.input_indexer_vector())
assert np.allclose(mm.input_indexer_vector(), row)
try:
next(mm)
Expand Down
2 changes: 0 additions & 2 deletions tests/monte_carlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,6 @@ def test_distributions_reproducible():
results[:, :, i] = mm.matrix.toarray()

given = results.sum(axis=0).sum(axis=0).ravel()
print(given.shape)
print(given)
expected = np.array(
[
21.06909828,
Expand Down
1 change: 0 additions & 1 deletion tests/ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def test_ordering():
]
for dp in dps:
dp.rehydrate_interface("w-fourth", Interface())
print(list(dp.groups))

mm = MappedMatrix(packages=dps, matrix="matrix-a")
assert [grp.label for grp in mm.groups] == [
Expand Down

0 comments on commit d1ee2a7

Please sign in to comment.