Skip to content

Commit

Permalink
Feature/main score is kminus1 (#16)
Browse files Browse the repository at this point in the history
* swap names

* update

* update docs

* refactor tests

* refactor tests

* start version from 0-minor

* ignore type
  • Loading branch information
letiziaia authored Jul 27, 2024
1 parent 5eac2a8 commit 32d6838
Show file tree
Hide file tree
Showing 12 changed files with 407 additions and 146 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ Additional issues can be found by running `python3 -m flake8 .` and `python3 -m

## Tests

This code has test coverage for python 3.10, 3.11, and 3.12.
This code is built and tested for python 3.10, 3.11, and 3.12.
121 changes: 108 additions & 13 deletions RUNBOOK.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ $ python -m unittest discover -v

### Given opinion partitions for each of the topics, compute the multiway alignment score of all of them

#### 1. Perfect Alignment

```python
# import needed libraries
>>> import pandas as pd
Expand All @@ -81,33 +83,126 @@ $ python -m unittest discover -v
{
# on topic A, individuals 0 to 3 have opinion 0,
# individuals 4 to 7 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
"A": [0, 0, 0, 0, 1, 1, 1, 1],
"B": [0, 0, 0, 0, 1, 1, 1, 1],
"C": [1, 1, 1, 1, 0, 0, 0, 0],
}
)

# get list of labels for the consensus partition
>>> partition_labels = mac.get_consensus_labels(opinions=df)

# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
1.0

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=True,
... )
0.8767167706710732

# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=False,
... )
1.0

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="ami", adjusted=True,
... df, which_score="ami", adjusted=True,
... )
0.933281539775369
```

In this example, we computed multiway alignment for a perfectly aligned system of 8 individuals and 3 topics.
Both the multiway alignment scores obtained by using NMI and by using AMI give 1 (perfect alignment). However, if we are dealing with a sample of the population, we might want to account for the number of individuals, which is quite small here. To do so, we can adjust the scores with the null model. The resulting alignment score is still quite high, but it accounts for the fact that the perfect alignment we observe among only 8 individuals might arise by chance.

With a growing number of individuals, the effect of alignment arising from random chance is smaller, and the score does not overfit:

```python
>>> n_individuals = 1000
>>> df = pd.DataFrame(
{
"A": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
"B": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
"C": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
}
)


# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
6.40685300762983e-16
1.0

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="nmi", adjusted=True,
... df, which_score="nmi", adjusted=True,
... )
0.0
0.9991381194997214

# if we use NMI (normalized mutual info score) without adjusting it
# with a null model, the resulting score is inflated
# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="nmi", adjusted=False,
... df, which_score="ami", adjusted=False,
... )
0.6666666666666666
1.0

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=True,
... )
0.9998316111697388
```

#### 2. No Alignment

```python
>>> n_individuals = 10000
>>> opinions = np.array([0] * int(n_individuals/2) + [1] * int(n_individuals/2))
>>> o1 = opinions.copy()
>>> np.random.shuffle(opinions)
>>> o2 = opinions.copy()
>>> np.random.shuffle(opinions)
>>> o3 = opinions.copy()
>>> df = pd.DataFrame(
{
"A": o1,
"B": o2,
"C": o3,
}
)

# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
0.0002596921754934203

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=True,
... )
0.00014352480953112452

# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=False,
... )
0.00011540052022510332

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=True,
... )
5.4181170472338464e-05
```

For a random system, the multiway alignment score approaches 0.
4 changes: 2 additions & 2 deletions multiway_alignment/data/anes.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None:
.reset_index(drop=True)
)
dump_name = f"survey_{year}"
full, _ = mw_score.maximal_alignment_curve_nminusone(
full, _ = mw_score.maximal_alignment_curve(
opinions=_df,
which_score="ami",
adjusted=False,
Expand All @@ -103,7 +103,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None:
dump(full, dump_name + "_nminus1_ami_full")

print("null model")
mw_null.random_full_alignment_curves_kminusone(
mw_null.random_full_alignment_curves(
df=_df,
save_to=dump_name + "_nminus1_null",
which_score="ami",
Expand Down
27 changes: 16 additions & 11 deletions multiway_alignment/null_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_null_model(opinions: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
return null


def _one_iter(
def _one_iter_fullpartition(
opinions: Union[pd.DataFrame, pd.Series],
which_score: str = "ami",
adjusted: bool = False,
Expand All @@ -45,13 +45,13 @@ def _one_iter(
"""
null = get_null_model(opinions=opinions)

_full_res, _ = ma_score.maximal_alignment_curve(
_full_res, _ = ma_score.maximal_alignment_curve_fullpartition(
null, which_score=which_score, adjusted=adjusted
)
return _full_res


def _one_iter_kminusone(
def _one_iter(
opinions: Union[pd.DataFrame, pd.Series],
which_score: str = "ami",
adjusted: bool = False,
Expand All @@ -65,13 +65,13 @@ def _one_iter_kminusone(
"""
null = get_null_model(opinions=opinions)

_full_res, _ = ma_score.maximal_alignment_curve_nminusone(
_full_res, _ = ma_score.maximal_alignment_curve(
null, which_score=which_score, adjusted=adjusted
)
return _full_res


def random_full_alignment_curves(
def random_full_alignment_curves_fullpartition(
df: pd.DataFrame,
save_to: str,
which_score: str = "ami",
Expand All @@ -97,7 +97,10 @@ def random_full_alignment_curves(
logger.info(f"Created new directory {save_to}")
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
partial(_one_iter, **{"which_score": which_score, "adjusted": adjusted}),
partial(
_one_iter_fullpartition,
**{"which_score": which_score, "adjusted": adjusted}, # type: ignore
),
[df.copy()] * n_tries,
)
i = 0
Expand All @@ -106,7 +109,7 @@ def random_full_alignment_curves(
i += 1


def random_full_alignment_curves_kminusone(
def random_full_alignment_curves(
df: pd.DataFrame,
save_to: str,
which_score: str = "ami",
Expand All @@ -133,8 +136,8 @@ def random_full_alignment_curves_kminusone(
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
partial(
_one_iter_kminusone,
**{"which_score": which_score, "adjusted": adjusted},
_one_iter,
**{"which_score": which_score, "adjusted": adjusted}, # type: ignore
),
[df.copy()] * n_tries,
)
Expand All @@ -144,7 +147,9 @@ def random_full_alignment_curves_kminusone(
i += 1


def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]:
def expected_curve_fullpartition(
opinions: Union[pd.DataFrame, pd.Series]
) -> List[float]:
"""
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
Expand Down Expand Up @@ -181,5 +186,5 @@ def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]:
return _expected_best_scores


def expected_curve_equal_sized_clusters(n_layers: int) -> List[float]:
def expected_curve_fullpartition_equal_sized_clusters(n_layers: int) -> List[float]:
return [2 / (1 + k) for k in range(2, n_layers + 1)]
12 changes: 6 additions & 6 deletions multiway_alignment/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _layer_expectation(
return np.array(_all_scores).mean()


def multiway_alignment_score_nminusone(
def multiway_alignment_score(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -76,7 +76,7 @@ def multiway_alignment_score_nminusone(
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def multiway_alignment_score(
def multiway_alignment_score_fullpartition(
opinions: typing.Union[pd.DataFrame, pd.Series],
mutual_clusters_labels: typing.List,
which_score: str = "nmi",
Expand Down Expand Up @@ -114,7 +114,7 @@ def multiway_alignment_score(
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def maximal_alignment_curve_nminusone(
def maximal_alignment_curve(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -161,7 +161,7 @@ def maximal_alignment_curve_nminusone(
l_comb_df.reset_index(drop=True, inplace=True)

# CRITERIA
nmi = multiway_alignment_score_nminusone(
nmi = multiway_alignment_score(
l_comb_df, which_score=which_score, adjusted=adjusted
)

Expand Down Expand Up @@ -189,7 +189,7 @@ def maximal_alignment_curve_nminusone(
return all_scores_by_combination_size, best_by_combination_size


def maximal_alignment_curve(
def maximal_alignment_curve_fullpartition(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -239,7 +239,7 @@ def maximal_alignment_curve(
labels_list = get_consensus_labels(opinions=l_comb_df)

# CRITERIA
nmi = multiway_alignment_score(
nmi = multiway_alignment_score_fullpartition(
l_comb_df, labels_list, which_score=which_score, adjusted=adjusted
)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="multiway_alignment",
version="0.1.0",
version="0.0.1",
packages=find_packages(),
author="Letizia Iannucci",
author_email="letizia.iannucci@aalto.fi",
Expand Down
65 changes: 0 additions & 65 deletions tests/test_compute_maximal_alignment_curve.py

This file was deleted.

Loading

0 comments on commit 32d6838

Please sign in to comment.