Merge pull request #3 from esrel/dev

nan softmax, etc.
esrel · Jan 17, 2024 · 7541e31 · 7541e31
2 parents 0d0ccac + f065c1b
commit 7541e31
Show file tree

Hide file tree

Showing 8 changed files with 117 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,24 @@
 # ChangeLog
 
+## 0.1.4
+
+2024.01.12
+
+- added `cutoff` threshold to `fuse`
+- added `scaled` kwarg to `fuse` to apply `softmax` to fused vectors
+
+## 0.1.3
+
+2024.01.09
+
+- added `softmax` with `nan` support to `utils.py`
+
 ## 0.1.2
 
 2024.01.07
 
 - added vectorization from dict of scores
-- added priority fusion
+- added `priority` fusion
 
 ## 0.1.1
 
@@ -16,9 +29,7 @@
 
 ## 0.1.0
 
-2024.01.04
-
-Initial release.
+2024.01.04: initial release
 
 - `basic`, `voter` and `borda` (simple and tournament-style) decision fusion methods.
 - support function for vector scaling (`scale`).

diff --git a/README.md b/README.md
@@ -39,6 +39,26 @@ where predictors (classifiers) may have different label spaces.
 Consequently, the library makes distinction between classes predicted with a low score (`0.0`)
 and not predicted classes (`nan`).
 
+## Vectorization
+
+eFusor provides a `vectorize` function that performs the vectorization,
+distinguishing between predicted and not-predicted classes.
+The function expects a `list` of class labels 
+and a `dict` of prediction scores.
+
+```python
+from efusor import vectorize
+
+labels = ["A", "B", "C", "D"]
+scores = {"A": 0.75, "B": 0.25, "C": 0.00}
+
+vector = vectorize(labels, scores)
+# array([0.75, 0.25, 0.  ,  nan])
+```
+
+The function accepts scores as a vector, a matrix, or a tensor;
+that is, as a dict, a list of dicts, or a list of lists of dicts.
+
 ## Fusion Methods
 
 ### Basic Fusion Methods

diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@ def read(path):
 setup(
     name='efusor',
     url='https://github.com/esrel/efusor',
-    version='0.1.2',
+    version='0.1.4',
     author='Evgeny A. Stepanov',
     author_email='stepanov.evgeny.a@gmail.com',
     description='Extended Decision Fusion',

diff --git a/src/efusor/fusor.py b/src/efusor/fusor.py
@@ -12,11 +12,15 @@
 from efusor.basic import apply
 from efusor.borda import borda
 from efusor.priority import prioritize
+from efusor.utils import softmax
 
 
 def fuse(tensor: list | np.ndarray,
          method: str = "hard_voting",
          weights: list | np.ndarray = None,
+         *,
+         cutoff: float = None,
+         scaled: bool = False,
          digits: int = None
          ) -> list:
     """
@@ -27,12 +31,20 @@ def fuse(tensor: list | np.ndarray,
     :type method: str, optional
     :param weights: predictor weights; defaults to None
     :type weights: np.ndarray, optional
+    :param cutoff: prediction cut-off threshold; defaults to None
+    :type cutoff: float, optional
+    :param scaled: if to re-scale final scores (softmax); defaults to False
+    :type scaled: bool, optional
     :param digits: rounding precision; defaults to None
     :type digits: int, optional
     :return: fused scores
     :rtype: np.ndarray
     """
     tensor = np.array(tensor) if isinstance(tensor, list) else tensor
+
+    if cutoff:
+        tensor[tensor < cutoff] = np.nan
+
     weights = np.array(weights) if isinstance(weights, list) else weights
 
     if method in {"hard_voting", "soft_voting", "majority_voting"}:
@@ -48,6 +60,7 @@ def fuse(tensor: list | np.ndarray,
     else:
         raise ValueError(f"unsupported fusion method: {method}")
 
+    result = np.apply_along_axis(softmax, -1, result) if scaled else result
     result = np.round(result,  decimals=digits) if digits else result
 
     return result.tolist()
diff --git a/src/efusor/scaler.py b/src/efusor/scaler.py
@@ -20,7 +20,10 @@ def scale(vector: np.ndarray) -> np.ndarray:
     :return: vector
     :rtype: np.ndarray
     """
-    if not vector.any() or vector.min() == vector.max():
+    if np.isnan(vector).all():
         return vector
 
-    return (vector - np.min(vector)) / (np.max(vector) - np.min(vector))
+    if np.nanmin(vector) == np.nanmax(vector):
+        return vector
+
+    return (vector - np.nanmin(vector)) / (np.nanmax(vector) - np.nanmin(vector))
diff --git a/src/efusor/utils.py b/src/efusor/utils.py
@@ -32,3 +32,14 @@ def batch(*vector: np.ndarray) -> np.ndarray:
     :rtype: np.ndarray
     """
     return np.stack(vector)
+
+
+def softmax(vector: np.ndarray) -> np.ndarray:
+    """
+    numerically stable softmax with nan support
+    :param vector: predictions scores (not probability)
+    :type vector: np.ndarray
+    :return: softmax
+    :rtype: np.ndarray
+    """
+    return np.exp(vector - np.nanmax(vector))/np.nansum(np.exp(vector - np.nanmax(vector)))
diff --git a/tests/test_fusor.py b/tests/test_fusor.py
@@ -63,3 +63,44 @@ def test_fuse_scalar(scores: list, weights: list) -> None:
             for scalar in vector:
                 with pytest.raises(IndexError):
                     fuse(np.array(scalar), method="hard_voting", weights=np.array(weights))
+
+
+def test_fusor_cutoff(scores: list) -> None:
+    """
+    test cutoff
+    :param scores: prediction scores
+    :type scores: list
+    """
+    cutoff_max = [[0.7, 0.3, 0.5],
+                  [0.4, 0.4, 0.6],
+                  [0.4, 0.7, np.nan],
+                  [0.3, 0.3, 1.0],
+                  [np.nan, np.nan, np.nan]]
+
+    result = fuse(np.array(scores), method="max", cutoff=0.1)
+    assert np.array_equal(np.array(result), np.array(cutoff_max), equal_nan=True)
+
+
+def test_fusor_scaled(scores: list) -> None:
+    """
+    test softmax
+    :param scores: prediction scores
+    :type scores: list
+    """
+    scaled_max = [[0.4, 0.27, 0.33],
+                  [0.31, 0.31, 0.38],
+                  [0.33, 0.45, 0.22],
+                  [0.25, 0.25, 0.5],
+                  [0.33, 0.33, 0.33]]
+
+    cutoff_max = [[0.4, 0.27, 0.33],
+                  [0.31, 0.31, 0.38],
+                  [0.43, 0.57, np.nan],
+                  [0.25, 0.25, 0.5],
+                  [np.nan, np.nan, np.nan]]
+
+    result = fuse(np.array(scores), method="max", scaled=True, digits=2)
+    assert np.array_equal(np.array(result), np.array(scaled_max), equal_nan=True)
+
+    result = fuse(np.array(scores), method="max", cutoff=0.1, scaled=True, digits=2)
+    assert np.array_equal(np.array(result), np.array(cutoff_max), equal_nan=True)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from efusor.utils import batch, vectorize
+from efusor.utils import batch, vectorize, softmax
 
 
 def test_batch_tensor(scores: list) -> None:
@@ -89,3 +89,13 @@ def test_vectorize_vector(scores: list) -> None:
         for j, vector in enumerate(matrix):
             assert np.array_equal(vectorize(labels, vector),
                                   np.array(scores[i][j]), equal_nan=True)
+
+
+def test_softmax() -> None:
+    """ test softmax """
+    # test normal
+    assert softmax(np.array([0.25, -0.25, 0.0])).sum() == 1.0
+    # test large number
+    assert softmax(np.array([999, 100, 0.0])).sum() == 1.0
+    # test nan support
+    assert np.nansum(softmax(np.array([0.75, 0.25, 0.0, np.nan]))) == 1.0