Refactor/rename mutualcl (#3)
* refactor

* fix typing

* update tests after refactoring

* add mypy

* add stubs

* refactor: naming and types

* default lint config

* unused import

* typing and linting

* untyped import

* revert to same pipfile

* fix
letiziaia authored Mar 8, 2024
1 parent 9a90ae1 commit 859e465
Showing 17 changed files with 442 additions and 2,853 deletions.
1 change: 1 addition & 0 deletions .github/workflows/validate.yml
@@ -22,6 +22,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pipenv
pipenv lock
pipenv verify
pipenv install --dev
- name: Validate
21 changes: 10 additions & 11 deletions Pipfile
@@ -4,34 +4,33 @@ verify_ssl = true
name = "pypi"

[packages]
click = ">=8.0"
clusim = "*"
cython = "*"
click = "*"
joblib = "~=1.2.0"
loguru = "~=0.6.0"
loguru = "*"
matplotlib = "~=3.6.1"
networkx = "*"
numba = "*"
numpy = "~=1.24.0"
numpy = "*"
pandas = "==1.3.5"
scikit-learn = "==1.0.2"
scikit-learn = "~=1.4.1"
scipy = "*"
seaborn = "*"
setuptools = "*"
regex = "*"
tqdm = "*"
wheel = "*"

[dev-packages]
black = "*"
coverage = "*"
flake8 = "*"
mypy = "*"
notebook = "*"
pip-audit = ">=2.4.10"
pandas-stubs = "*"
pip-audit = "*"
ruff = "*"
types-tqdm = "*"

[scripts]
validate = "bash -c 'python3 -m flake8 && python3 -m pip_audit'"
validate = "bash -c 'python3 -m flake8 && python3 -m mypy . && python3 -m pip_audit'"

[requires]
python_version = "3.10"

2,495 changes: 0 additions & 2,495 deletions Pipfile.lock

This file was deleted.

5 changes: 2 additions & 3 deletions README.md
@@ -5,15 +5,14 @@

# multilayer-alignment

This repository implements an algorithm for quantifying multilayer alignment.
You can refer to the [slide deck](https://docs.google.com/presentation/d/1HMEE5kOwwJPLBmAgycKIMSWRx0eCxd3RtSxVR1Jdczw/) for the general idea.
This repository implements an algorithm for quantifying multilayer alignment or higher-order alignment, that is, the alignment across n different dimensions. You can refer to the [slide deck](https://docs.google.com/presentation/d/1HMEE5kOwwJPLBmAgycKIMSWRx0eCxd3RtSxVR1Jdczw/) for the original idea.

## Structure of the repo

- `\multilayer_alignment\`: source code
- `\tests\`: tests for the source code

## Setting up
## Setting up the development environment

![python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)

30 changes: 15 additions & 15 deletions RUNBOOK.md
@@ -2,10 +2,10 @@

## Overview

This repository contains code to extend pairwise measure of alignment
to N-wise case.
This repository contains code to extend a pairwise measure of alignment
based on mutual information to an N-wise case.

## Operational tasks
## Development tasks

### Install all dependencies and activate the environment

@@ -25,26 +25,26 @@ From root directory,
$ python -m unittest discover -v
```

### Given partitions for each of the layers, compute mutual clusters
### Given opinion partitions for each of the topics, compute the consensus partition

```python
# import needed modules
>>> import pandas as pd
>>> from multilayer_alignment.mutual_clusters import compute_mutual_clusters_recursive
>>> from multilayer_alignment.consensus import get_consensus_partition

# load the partitions labels to a pandas DataFrame
# load the opinion labels to a pandas DataFrame
>>> df = pd.DataFrame(
{
# in layer A, nodes 0 and 1 have label 0,
# nodes 2 and 3 have label 1
# on topic A, individuals 0 and 1 have opinion 0,
# individuals 2 and 3 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
}
)

# get mutual clusters
>>> compute_mutual_clusters_recursive(cluster_labels_df=df)
# get consensus partition
>>> get_consensus_partition(opinions=df)
{
"A0_B0_C1": {0},
"A0_B1_C0": {1},
@@ -58,21 +58,21 @@ Alternatively:
```python
# import needed modules
>>> import pandas as pd
>>> from multilayer_alignment.mutual_clusters import compute_mutual_clusters
>>> from multilayer_alignment.consensus import get_consensus_partition_recursive

# load the partitions labels to a pandas DataFrame
>>> df = pd.DataFrame(
{
# in layer A, nodes 0 and 1 have label 0,
# nodes 2 and 3 have label 1
# on topic A, individuals 0 and 1 have opinion 0,
# individuals 2 and 3 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
}
)

# get mutual clusters
>>> compute_mutual_clusters(cluster_labels_df=df)
# get consensus partition
>>> get_consensus_partition_recursive(opinions=df)
{
"A0_B0_C1": {0},
"A0_B1_C0": {1},
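
The two RUNBOOK snippets above show the renamed entry points, but their output dictionaries are cut off by the diff view. Below is a minimal, self-contained sketch of the same call pair, assuming (as the RUNBOOK examples suggest) that `get_consensus_partition` and `get_consensus_partition_recursive` accept the `opinions` keyword and return the same mapping from consensus-group labels to sets of individual ids:

```python
import pandas as pd

from multilayer_alignment.consensus import (
    get_consensus_partition,
    get_consensus_partition_recursive,
)

# three topics (A, B, C), four individuals (rows 0..3), as in the RUNBOOK example
opinions = pd.DataFrame(
    {
        "A": [0, 0, 1, 1],
        "B": [0, 1, 0, 1],
        "C": [1, 0, 1, 0],
    }
)

iterative = get_consensus_partition(opinions=opinions)
recursive = get_consensus_partition_recursive(opinions=opinions)

# the RUNBOOK shows identical output for both variants, so they should agree
assert iterative == recursive
print(iterative)  # e.g. {"A0_B0_C1": {0}, "A0_B1_C0": {1}, ...} (rest elided in the diff)
```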
67 changes: 34 additions & 33 deletions multilayer_alignment/alignment_score.py
@@ -3,23 +3,23 @@
import numpy as np
from itertools import combinations
from functools import partial
from joblib import dump
from joblib import dump # type: ignore

from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score # type: ignore
from sklearn.metrics import adjusted_mutual_info_score # type: ignore

import multiprocessing as mp
from multiprocessing.pool import Pool
from tqdm import tqdm

from multilayer_alignment.mutual_clusters import compute_mutual_clusters
from multilayer_alignment.mutual_clusters import get_mutual_clusters_labels
from multilayer_alignment.consensus import get_consensus_partition
from multilayer_alignment.consensus import get_consensus_labels

from multilayer_alignment.utils.logging import logger


def _compute_layer_expectation(
layer: np.array, scoring_function: typing.Callable
layer: typing.Iterable, scoring_function: typing.Callable
) -> float:
"""
:param layer: 1d np.array with clustering assignment
@@ -29,22 +29,23 @@ def _compute_layer_expectation(
_all_scores = []
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
scoring_function, [np.random.permutation(layer) for _ in range(10)]
)
scoring_function, [np.random.permutation(layer) for _ in range(10)] # type: ignore
) # type: ignore
for value in result.get():
# NOTE: assuming scores are always >=0
# NOTE: in case of AMI, it is possible to get negative scores,
# but we cap them to 0 so get only scores >= 0
_all_scores.append(max(value, 0))
return np.array(_all_scores).mean()


def compute_multilayer_alignment_score(
cluster_labels_df: pd.DataFrame,
def multilayer_alignment_score(
opinions: typing.Union[pd.DataFrame, pd.Series],
mutual_clusters_labels: typing.List,
which_score: str = "nmi",
adjusted: bool = False,
) -> float:
"""
:param cluster_labels_df: pd.DataFrame having one column per layer and one row per node,
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
:param mutual_clusters_labels: list, a list of labels for mutual clusters
:param which_score: str, one of "nmi" or "ami"
@@ -59,9 +60,9 @@ def compute_multilayer_alignment_score(
_score_f = adjusted_mutual_info_score

avg_nmi = 0
_expected_nmi = 0
for layer_id in cluster_labels_df.columns:
_layer = cluster_labels_df[layer_id].values
_expected_nmi = 0.0
for layer_id in opinions.columns:
_layer = opinions[layer_id].values
_score = _score_f(_layer, mutual_clusters_labels, average_method="arithmetic")
avg_nmi += _score

@@ -72,17 +73,17 @@
_score_f, mutual_clusters_labels, **{"average_method": "arithmetic"}
),
)
return (avg_nmi - _expected_nmi) / len(cluster_labels_df.columns)
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def compute_maximal_alignment_curve(
cluster_labels_df: pd.DataFrame,
def maximal_alignment_curve(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
dump_to: typing.Optional[str] = None,
) -> typing.Tuple:
"""
:param cluster_labels_df: pd.DataFrame having one column per layer and one row per node,
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
:param which_score: str, one of "nmi" or "ami"
:param adjusted: bool, default: False
@@ -103,39 +104,39 @@

best_by_combination_size = dict()
all_scores_by_combination_size = dict()
_num_of_layers = len(cluster_labels_df.columns)
_num_of_layers = len(opinions.columns)
# skipping size 1
for length in range(2, _num_of_layers + 1):
logger.info(f"combinations of size {length}")
# Get all combinations of cluster_labels_df.columns of length "length"
_columns_combinations = combinations(cluster_labels_df.columns, length)
# Get all combinations of opinions.columns of length "length"
_columns_combinations = combinations(opinions.columns, length)

best_layers_combination = None
best_nmi = 0
best_nmi = 0.0
# best_layers_combination_mutual_communities = dict()

for l_comb in tqdm(_columns_combinations):
l_comb = list(l_comb)
l_comb_df = cluster_labels_df[l_comb].copy()
for _l_comb in tqdm(_columns_combinations):
l_comb = list(_l_comb)
l_comb_df = opinions[l_comb].copy()
# keep only items that have labels for all items in l_comb and reindex
l_comb_df.dropna(inplace=True)
l_comb_df.reset_index(drop=True, inplace=True)
mutual_clusters = compute_mutual_clusters(l_comb_df)
mutual_clusters_labels = get_mutual_clusters_labels(mutual_clusters)
mutual_clusters = get_consensus_partition(l_comb_df)
mutual_clusters_labels = get_consensus_labels(mutual_clusters)
labels_list = (
mutual_clusters_labels.set_index("id")
.iloc[l_comb_df.index]["label"]
.values
.to_list()
)

# CRITERIA
nmi = compute_multilayer_alignment_score(
nmi = multilayer_alignment_score(
l_comb_df, labels_list, which_score=which_score, adjusted=adjusted
)

all_scores_by_combination_size[
f"{length}+" + "+".join(sorted(l_comb))
] = nmi
all_scores_by_combination_size[f"{length}+" + "+".join(sorted(l_comb))] = (
nmi
)
# (
# nmi,
# mutual_clusters,
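
The hunks above rename the scoring API (`compute_multilayer_alignment_score` to `multilayer_alignment_score`, `compute_maximal_alignment_curve` to `maximal_alignment_curve`); the rest of the file is not shown in this view. A short usage sketch of the renamed functions, based only on the signatures and the loop body visible above (the exact return value of `maximal_alignment_curve` is not shown here, so it is kept opaque):

```python
import pandas as pd

from multilayer_alignment.consensus import get_consensus_partition, get_consensus_labels
from multilayer_alignment.alignment_score import (
    multilayer_alignment_score,
    maximal_alignment_curve,
)

opinions = pd.DataFrame(
    {
        "A": [0, 0, 1, 1],
        "B": [0, 1, 0, 1],
        "C": [1, 0, 1, 0],
    }
)

# consensus partition and one consensus label per individual,
# mirroring the loop body of maximal_alignment_curve shown above
consensus = get_consensus_partition(opinions)
labels_df = get_consensus_labels(consensus)
labels = labels_df.set_index("id").iloc[opinions.index]["label"].to_list()

# alignment of the three topics against the consensus labels (NMI, unadjusted)
score = multilayer_alignment_score(opinions, labels, which_score="nmi", adjusted=False)
print(score)

# per the signature this returns a tuple, presumably
# (best_by_combination_size, all_scores_by_combination_size)
curve = maximal_alignment_curve(opinions, which_score="nmi", adjusted=False)
```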