Refactor/rename mutualcl (#3)
* refactor

* fix typing

* update tests after refactoring

* add mypy

* add stubs

* refactor: naming and types

* default lint config

* unused import

* typing and linting

* untyped import

* revert to same pipfile

* fix
letiziaia authored Mar 8, 2024
1 parent 9a90ae1 commit 859e465
Showing 17 changed files with 442 additions and 2,853 deletions.
1 change: 1 addition & 0 deletions .github/workflows/validate.yml
@@ -22,6 +22,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pipenv
pipenv lock
pipenv verify
pipenv install --dev
- name: Validate
21 changes: 10 additions & 11 deletions Pipfile
@@ -4,34 +4,33 @@ verify_ssl = true
name = "pypi"

[packages]
click = ">=8.0"
clusim = "*"
cython = "*"
click = "*"
joblib = "~=1.2.0"
loguru = "~=0.6.0"
loguru = "*"
matplotlib = "~=3.6.1"
networkx = "*"
numba = "*"
numpy = "~=1.24.0"
numpy = "*"
pandas = "==1.3.5"
scikit-learn = "==1.0.2"
scikit-learn = "~=1.4.1"
scipy = "*"
seaborn = "*"
setuptools = "*"
regex = "*"
tqdm = "*"
wheel = "*"

[dev-packages]
black = "*"
coverage = "*"
flake8 = "*"
mypy = "*"
notebook = "*"
pip-audit = ">=2.4.10"
pandas-stubs = "*"
pip-audit = "*"
ruff = "*"
types-tqdm = "*"

[scripts]
validate = "bash -c 'python3 -m flake8 && python3 -m pip_audit'"
validate = "bash -c 'python3 -m flake8 && python3 -m mypy . && python3 -m pip_audit'"

[requires]
python_version = "3.10"

2,495 changes: 0 additions & 2,495 deletions Pipfile.lock

This file was deleted.

5 changes: 2 additions & 3 deletions README.md
@@ -5,15 +5,14 @@

# multilayer-alignment

This repository implements an algorithm for quantifying multilayer alignment.
You can refer to the [slide deck](https://docs.google.com/presentation/d/1HMEE5kOwwJPLBmAgycKIMSWRx0eCxd3RtSxVR1Jdczw/) for the general idea.
This repository implements an algorithm for quantifying multilayer alignment or higher-order alignment, that is, the alignment across n different dimensions. You can refer to the [slide deck](https://docs.google.com/presentation/d/1HMEE5kOwwJPLBmAgycKIMSWRx0eCxd3RtSxVR1Jdczw/) for the original idea.

## Structure of the repo

- `\multilayer_alignment\`: source code
- `\tests\`: tests for the source code

## Setting up
## Setting up the development environment

![python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)

30 changes: 15 additions & 15 deletions RUNBOOK.md
@@ -2,10 +2,10 @@

## Overview

This repository contains code to extend pairwise measure of alignment
to N-wise case.
This repository contains code to extend a pairwise measure of alignment
based on mutual information to an N-wise case.

## Operational tasks
## Development tasks

### Install all dependencies and activate the environment

@@ -25,26 +25,26 @@ From root directory,
$ python -m unittest discover -v
```

### Given partitions for each of the layers, compute mutual clusters
### Given opinion partitions for each of the topics, compute the consensus partition

```python
# import needed modules
>>> import pandas as pd
>>> from multilayer_alignment.mutual_clusters import compute_mutual_clusters_recursive
>>> from multilayer_alignment.consensus import get_consensus_partition

# load the partitions labels to a pandas DataFrame
# load the opinion labels to a pandas DataFrame
>>> df = pd.DataFrame(
{
# in layer A, nodes 0 and 1 have label 0,
# nodes 2 and 3 have label 1
# on topic A, individuals 0 and 1 have opinion 0,
# individuals 2 and 3 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
}
)

# get mutual clusters
>>> compute_mutual_clusters_recursive(cluster_labels_df=df)
# get consensus partition
>>> get_consensus_partition(opinions=df)
{
"A0_B0_C1": {0},
"A0_B1_C0": {1},
@@ -58,21 +58,21 @@ Alternatively:
```python
# import needed modules
>>> import pandas as pd
>>> from multilayer_alignment.mutual_clusters import compute_mutual_clusters
>>> from multilayer_alignment.consensus import get_consensus_partition_recursive

# load the partitions labels to a pandas DataFrame
>>> df = pd.DataFrame(
{
# in layer A, nodes 0 and 1 have label 0,
# nodes 2 and 3 have label 1
# on topic A, individuals 0 and 1 have opinion 0,
# individuals 2 and 3 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
}
)

# get mutual clusters
>>> compute_mutual_clusters(cluster_labels_df=df)
# get consensus partition
>>> get_consensus_partition_recursive(opinions=df)
{
"A0_B0_C1": {0},
"A0_B1_C0": {1},
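
The two RUNBOOK snippets above show the renamed entry points, but their output dictionaries are cut off by the diff view. Below is a minimal, self-contained sketch of the same call pair, assuming (as the RUNBOOK examples suggest) that `get_consensus_partition` and `get_consensus_partition_recursive` accept the `opinions` keyword and return the same mapping from consensus-group labels to sets of individual ids:

```python
import pandas as pd

from multilayer_alignment.consensus import (
    get_consensus_partition,
    get_consensus_partition_recursive,
)

# three topics (A, B, C), four individuals (rows 0..3), as in the RUNBOOK example
opinions = pd.DataFrame(
    {
        "A": [0, 0, 1, 1],
        "B": [0, 1, 0, 1],
        "C": [1, 0, 1, 0],
    }
)

iterative = get_consensus_partition(opinions=opinions)
recursive = get_consensus_partition_recursive(opinions=opinions)

# the RUNBOOK shows identical output for both variants, so they should agree
assert iterative == recursive
print(iterative)  # e.g. {"A0_B0_C1": {0}, "A0_B1_C0": {1}, ...} (rest elided in the diff)
```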
67 changes: 34 additions & 33 deletions multilayer_alignment/alignment_score.py
@@ -3,23 +3,23 @@
import numpy as np
from itertools import combinations
from functools import partial
from joblib import dump
from joblib import dump # type: ignore

from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score # type: ignore
from sklearn.metrics import adjusted_mutual_info_score # type: ignore

import multiprocessing as mp
from multiprocessing.pool import Pool
from tqdm import tqdm

from multilayer_alignment.mutual_clusters import compute_mutual_clusters
from multilayer_alignment.mutual_clusters import get_mutual_clusters_labels
from multilayer_alignment.consensus import get_consensus_partition
from multilayer_alignment.consensus import get_consensus_labels

from multilayer_alignment.utils.logging import logger


def _compute_layer_expectation(
layer: np.array, scoring_function: typing.Callable
layer: typing.Iterable, scoring_function: typing.Callable
) -> float:
"""
:param layer: 1d np.array with clustering assignment
@@ -29,22 +29,23 @@ def _compute_layer_expectation(
_all_scores = []
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
scoring_function, [np.random.permutation(layer) for _ in range(10)]
)
scoring_function, [np.random.permutation(layer) for _ in range(10)] # type: ignore
) # type: ignore
for value in result.get():
# NOTE: assuming scores are always >=0
# NOTE: in case of AMI, it is possible to get negative scores,
# but we cap them to 0 so get only scores >= 0
_all_scores.append(max(value, 0))
return np.array(_all_scores).mean()


def compute_multilayer_alignment_score(
cluster_labels_df: pd.DataFrame,
def multilayer_alignment_score(
opinions: typing.Union[pd.DataFrame, pd.Series],
mutual_clusters_labels: typing.List,
which_score: str = "nmi",
adjusted: bool = False,
) -> float:
"""
:param cluster_labels_df: pd.DataFrame having one column per layer and one row per node,
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
:param mutual_clusters_labels: list, a list of labels for mutual clusters
:param which_score: str, one of "nmi" or "ami"
@@ -59,9 +60,9 @@ def compute_multilayer_alignment_score(
_score_f = adjusted_mutual_info_score

avg_nmi = 0
_expected_nmi = 0
for layer_id in cluster_labels_df.columns:
_layer = cluster_labels_df[layer_id].values
_expected_nmi = 0.0
for layer_id in opinions.columns:
_layer = opinions[layer_id].values
_score = _score_f(_layer, mutual_clusters_labels, average_method="arithmetic")
avg_nmi += _score

@@ -72,17 +73,17 @@
_score_f, mutual_clusters_labels, **{"average_method": "arithmetic"}
),
)
return (avg_nmi - _expected_nmi) / len(cluster_labels_df.columns)
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def compute_maximal_alignment_curve(
cluster_labels_df: pd.DataFrame,
def maximal_alignment_curve(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
dump_to: typing.Optional[str] = None,
) -> typing.Tuple:
"""
:param cluster_labels_df: pd.DataFrame having one column per layer and one row per node,
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
:param which_score: str, one of "nmi" or "ami"
:param adjusted: bool, default: False
@@ -103,39 +104,39 @@

best_by_combination_size = dict()
all_scores_by_combination_size = dict()
_num_of_layers = len(cluster_labels_df.columns)
_num_of_layers = len(opinions.columns)
# skipping size 1
for length in range(2, _num_of_layers + 1):
logger.info(f"combinations of size {length}")
# Get all combinations of cluster_labels_df.columns of length "length"
_columns_combinations = combinations(cluster_labels_df.columns, length)
# Get all combinations of opinions.columns of length "length"
_columns_combinations = combinations(opinions.columns, length)

best_layers_combination = None
best_nmi = 0
best_nmi = 0.0
# best_layers_combination_mutual_communities = dict()

for l_comb in tqdm(_columns_combinations):
l_comb = list(l_comb)
l_comb_df = cluster_labels_df[l_comb].copy()
for _l_comb in tqdm(_columns_combinations):
l_comb = list(_l_comb)
l_comb_df = opinions[l_comb].copy()
# keep only items that have labels for all items in l_comb and reindex
l_comb_df.dropna(inplace=True)
l_comb_df.reset_index(drop=True, inplace=True)
mutual_clusters = compute_mutual_clusters(l_comb_df)
mutual_clusters_labels = get_mutual_clusters_labels(mutual_clusters)
mutual_clusters = get_consensus_partition(l_comb_df)
mutual_clusters_labels = get_consensus_labels(mutual_clusters)
labels_list = (
mutual_clusters_labels.set_index("id")
.iloc[l_comb_df.index]["label"]
.values
.to_list()
)

# CRITERIA
nmi = compute_multilayer_alignment_score(
nmi = multilayer_alignment_score(
l_comb_df, labels_list, which_score=which_score, adjusted=adjusted
)

all_scores_by_combination_size[
f"{length}+" + "+".join(sorted(l_comb))
] = nmi
all_scores_by_combination_size[f"{length}+" + "+".join(sorted(l_comb))] = (
nmi
)
# (
# nmi,
# mutual_clusters,
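
The hunks above rename the scoring API (`compute_multilayer_alignment_score` to `multilayer_alignment_score`, `compute_maximal_alignment_curve` to `maximal_alignment_curve`); the rest of the file is not shown in this view. A short usage sketch of the renamed functions, based only on the signatures and the loop body visible above (the exact return value of `maximal_alignment_curve` is not shown here, so it is kept opaque):

```python
import pandas as pd

from multilayer_alignment.consensus import get_consensus_partition, get_consensus_labels
from multilayer_alignment.alignment_score import (
    multilayer_alignment_score,
    maximal_alignment_curve,
)

opinions = pd.DataFrame(
    {
        "A": [0, 0, 1, 1],
        "B": [0, 1, 0, 1],
        "C": [1, 0, 1, 0],
    }
)

# consensus partition and one consensus label per individual,
# mirroring the loop body of maximal_alignment_curve shown above
consensus = get_consensus_partition(opinions)
labels_df = get_consensus_labels(consensus)
labels = labels_df.set_index("id").iloc[opinions.index]["label"].to_list()

# alignment of the three topics against the consensus labels (NMI, unadjusted)
score = multilayer_alignment_score(opinions, labels, which_score="nmi", adjusted=False)
print(score)

# per the signature this returns a tuple, presumably
# (best_by_combination_size, all_scores_by_combination_size)
curve = maximal_alignment_curve(opinions, which_score="nmi", adjusted=False)
```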