Merge pull request #23 from brightway-lca/sparse_matrix_mapping

Sparse matrix mapping. Fixes #22
brightway-lca · Nov 1, 2024 · d1ee2a7 · d1ee2a7
2 parents 087757b + 22f7d27
commit d1ee2a7
Show file tree

Hide file tree

Showing 8 changed files with 169 additions and 46 deletions.
diff --git a/dev/Mapping speed tests.ipynb b/dev/Mapping speed tests.ipynb
@@ -9,7 +9,8 @@
    "source": [
     "import itertools\n",
     "import numpy as np\n",
-    "from scipy import sparse"
+    "from scipy import sparse\n",
+    "import matrix_utils as mu"
    ]
   },
   {
@@ -60,20 +61,120 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
    "id": "wired-lesbian",
    "metadata": {},
    "outputs": [],
    "source": [
     "def index_with_indexarray(array_from):\n",
-    "    # Twice as fast as index_with_searchsorted\n",
-    "    unique = np.unique(array_from)\n",
-    "    values = np.arange(unique.max() + 1)\n",
+    "    array = np.unique(array_from)\n",
+    "    \n",
+    "    if array_from.shape == (0,):\n",
+    "        if empty_ok:\n",
+    "            max_value = 0\n",
+    "        else:\n",
+    "            raise EmptyArray(\"Empty array can't be used to map values\")\n",
+    "    else:\n",
+    "        max_value = int(array.max())\n",
+    "    index_array = np.zeros(max_value + 1) - 1\n",
+    "    index_array[array] = np.arange(len(array))\n",
     "\n",
-    "    index_array = np.zeros_like(unique) - 1\n",
-    "    index_array[unique] = values\n",
+    "    result = np.zeros_like(array_from) - 1\n",
+    "    mask = array_from <= max_value\n",
+    "    result[mask] = index_array[array_from[mask]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "b24eddaa-76e1-4bcc-8db9-4e648f564f13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def index_with_sparse_matrix(array_from):\n",
+    "    rows = np.unique(array_from)\n",
+    "    cols = np.zeros_like(rows)\n",
+    "    values = np.arange(1, len(rows) + 1)\n",
+    "    matrix = sparse.coo_matrix((values, (rows, cols)), (rows[-1] + 1, 1)).tocsr()\n",
     "\n",
-    "    return index_array[array_from]"
+    "    return (matrix[array_from, 0]).tocoo().data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "b6a818cc-00fb-457f-b91d-14044c028efd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 3, 1, 2, 0])"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "array_from = np.array([4, 19000, 4, 8, 1])\n",
+    "index_with_sparse_matrix(array_from)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d5486625-8bab-4500-8167-66f5a8f851f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<3488834974658x1 sparse array of type '<class 'numpy.int64'>'\n",
+       "\twith 4 stored elements in COOrdinate format>"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "inpt = np.array([1288834974657, 2288834974657, 3488834974657, 3288834974657])\n",
+    "array = np.unique(inpt)\n",
+    "max_value = array[-1]\n",
+    "max_index = len(array) - 1\n",
+    "matrix = sparse.coo_array(\n",
+    "    (np.arange(1, len(array) + 1), (array, np.zeros_like(array))),\n",
+    "    (max_value + 1, 1),\n",
+    ") #.todok()\n",
+    "matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "562a1c34-32b8-45fc-bf31-519d9b0a83f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "'coo_array' object is not subscriptable",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m      2\u001b[0m result \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros_like(inpt) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m      3\u001b[0m mask \u001b[38;5;241m=\u001b[39m inpt \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m max_value\n\u001b[0;32m----> 4\u001b[0m (\u001b[43mmatrix\u001b[49m\u001b[43m[\u001b[49m\u001b[43minpt\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmask\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mzeros_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43minpt\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmask\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m)\n",
+      "\u001b[0;31mTypeError\u001b[0m: 'coo_array' object is not subscriptable"
+     ]
+    }
+   ],
+   "source": [
+    "inpt = np.array([1288834974657, 2288834974657, 348883974657])\n",
+    "result = np.zeros_like(inpt) - 1\n",
+    "mask = inpt <= max_value\n",
+    "(matrix[inpt[mask], np.zeros_like(inpt[mask])])"
    ]
   },
   {
@@ -113,7 +214,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "little-marshall",
    "metadata": {},
    "outputs": [],
@@ -163,36 +264,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "parliamentary-period",
+   "execution_count": 15,
+   "id": "84f97b1b-eb9a-498b-bf24-979be3212fd4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "12.5 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
    "source": [
-    "_, mapping = index_with_searchsorted(arr)\n",
-    "arr2 = input_array()"
+    "%timeit index_with_indexarray(arr)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "laughing-selection",
+   "execution_count": 44,
+   "id": "7491e454-ce31-4132-90c9-0d37f79e92c8",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "5.29 ms ± 498 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "121 µs ± 352 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
      ]
     }
    ],
    "source": [
-    "%timeit index_with_arrays(arr2, np.zeros_like(arr2), mapping)"
+    "%timeit index_with_sparse_matrix(array_from)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "systematic-shopping",
    "metadata": {},
    "outputs": [],
@@ -290,7 +398,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -304,7 +412,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.9"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

diff --git a/matrix_utils/array_mapper.py b/matrix_utils/array_mapper.py
@@ -1,4 +1,5 @@
 import numpy as np
+from scipy import sparse
 
 from .errors import EmptyArray
 
@@ -23,28 +24,31 @@ class ArrayMapper:
 
     """  # NOQA: E501
 
-    def __init__(self, *, array: np.ndarray, sparse_cutoff: float = 0.1, empty_ok: bool = False):
+    def __init__(self, *, array: np.ndarray, sparse_cutoff: int = 50000, empty_ok: bool = False):
         self._check_input_array(array)
 
         # Even if already unique, this only adds ~2ms for 100.000 elements
         self.array = np.unique(array)
+        self.empty_ok = empty_ok
 
-        # TODO
-        # Sparse matrices could be used if the number of values present is much less
-        # than the number of possible values, given the (min, max) range.
-        # The default code will generate a complete mapping for the (min, max)
-        # interval, which can use too much memory in certain cases.
-        # self.use_sparse = len(self.keys) / self.keys.max() <= sparse_cutoff:
-
-        if array.shape == (0,):
-            if empty_ok:
+        if self.array.shape == (0,):
+            if self.empty_ok:
+                self.empty_input = True
                 self.max_value = 0
+                self.max_index = 0
+                return
             else:
                 raise EmptyArray("Empty array can't be used to map values")
         else:
-            self.max_value = int(self.array.max())
-        self.index_array = np.zeros(self.max_value + 1) - 1
-        self.index_array[self.array] = np.arange(len(self.array))
+            self.empty_input = False
+            self.max_value = self.array[-1]
+            self.max_index = len(self.array) - 1
+
+        # Zero serves as a missing value, so start at one
+        self.matrix = sparse.coo_matrix(
+            (np.arange(1, len(self.array) + 1), (self.array, np.zeros_like(self.array))),
+            (self.max_value + 1, 1),
+        ).tocsc()
 
     def __len__(self):
         return self.array.shape[0]
@@ -61,16 +65,25 @@ def map_array(self, array: np.ndarray) -> np.ndarray:
         if array.shape == (0,):
             # Empty array
             return array.copy()
+        elif self.empty_input:
+            if self.empty_ok:
+                # Return all with missing flag
+                return np.zeros_like(array) - 1
+            else:
+                raise EmptyArray("Can't map with empty input array")
 
         result = np.zeros_like(array) - 1
         mask = array <= self.max_value
-        result[mask] = self.index_array[array[mask]]
+        # https://numpy.org/doc/stable/reference/generated/numpy.matrix.A1.html
+        result[mask] = (self.matrix[array[mask], np.zeros_like(array[mask])]).A1 - 1
         return result
 
     def to_dict(self) -> dict:
         """Turn the mapping arrays into a Python dict. This is only useful for
         human examination, the normal implementation uses Numpy functions on the
         arrays directly."""
+        if self.empty_input:
+            return {}
         return {int(x): int(y) for x, y in zip(self.array, self.map_array(self.array))}
 
     def reverse_dict(self) -> dict:

diff --git a/matrix_utils/mapped_matrix.py b/matrix_utils/mapped_matrix.py
@@ -137,12 +137,12 @@ def __init__(
         col_indices = safe_concatenate_indices([obj.col_matrix for obj in self.groups], empty_ok)
 
         if diagonal:
-            x = int(self.row_mapper.index_array.max() + 1)
+            x = 0 if self.row_mapper.empty_input else self.row_mapper.max_index + 1
             dimensions = (x, x)
         else:
             dimensions = (
-                int(self.row_mapper.index_array.max() + 1),
-                int(self.col_mapper.index_array.max() + 1),
+                0 if self.row_mapper.empty_input else self.row_mapper.max_index + 1,
+                0 if self.col_mapper.empty_input else self.col_mapper.max_index + 1,
             )
 
         self.matrix = sparse.coo_matrix(

diff --git a/tests/array_mapper.py b/tests/array_mapper.py
@@ -12,9 +12,17 @@ def test_initial_setup():
     assert np.allclose(am.map_array(inpt), expected)
 
 
+def test_with_large_values():
+    inpt = np.array([1288834974657, 2288834974657, 3488834974657, 3288834974657])
+    am = ArrayMapper(array=inpt)
+    given = np.array([1288834974657, 228883474657, 3288834974657, 3488834974657])
+    expected = np.array([0, -1, 2, 3])
+    assert np.allclose(am.map_array(given), expected)
+
+
 def test_float_indices_raises_error():
     inpt = np.array([1, 2, 3, 6.0])
-    with pytest.raises(IndexError):
+    with pytest.raises(TypeError):
         ArrayMapper(array=inpt)
 
 
@@ -69,4 +77,4 @@ def test_empty_array_error():
 def test_empty_array_ok():
     am = ArrayMapper(array=np.array([], dtype=int), empty_ok=True)
     assert am.array.shape == (0,)
-    assert am.index_array.shape == (1,)
+    assert am.empty_input
diff --git a/tests/integration.py b/tests/integration.py
@@ -100,8 +100,6 @@ def test_matrix_construction_overlapping_sum(smaller):
     col = np.array([0, 1, 2, 3, 5, 4])
     data = np.array([12, 16.3, 4, 25, 12.3, 125])
     matrix = mm.matrix.tocoo()
-    print(matrix)
-    print(matrix.data)
     assert np.allclose(matrix.row, row)
     assert np.allclose(matrix.col, col)
     assert np.allclose(matrix.data, data)

diff --git a/tests/mapped_matrix.py b/tests/mapped_matrix.py
@@ -736,7 +736,6 @@ def test_input_index_vector(sensitivity_dps):
         (942484272, 942484272, 1, 1),
     ]
     for row in expected:
-        print(mm.input_indexer_vector())
         assert np.allclose(mm.input_indexer_vector(), row)
         try:
             next(mm)

diff --git a/tests/monte_carlo.py b/tests/monte_carlo.py
@@ -170,8 +170,6 @@ def test_distributions_reproducible():
         results[:, :, i] = mm.matrix.toarray()
 
     given = results.sum(axis=0).sum(axis=0).ravel()
-    print(given.shape)
-    print(given)
     expected = np.array(
         [
             21.06909828,

diff --git a/tests/ordering.py b/tests/ordering.py
@@ -21,7 +21,6 @@ def test_ordering():
     ]
     for dp in dps:
         dp.rehydrate_interface("w-fourth", Interface())
-        print(list(dp.groups))
 
     mm = MappedMatrix(packages=dps, matrix="matrix-a")
     assert [grp.label for grp in mm.groups] == [