Skip to content

Commit

Permalink
Feature/main score is kminus1 (#16)
Browse files Browse the repository at this point in the history
* swap names

* update

* update docs

* refactor tests

* refactor tests

* start version from 0-minor

* ignore type
  • Loading branch information
letiziaia authored Jul 27, 2024
1 parent 5eac2a8 commit 32d6838
Show file tree
Hide file tree
Showing 12 changed files with 407 additions and 146 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ Additional issues can be found by running `python3 -m flake8 .` and `python3 -m

## Tests

This code has test coverage for python 3.10, 3.11, and 3.12.
This code is built and tested for python 3.10, 3.11, and 3.12.
121 changes: 108 additions & 13 deletions RUNBOOK.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ $ python -m unittest discover -v

### Given opinion partitions for each of the topics, compute the multiway alignment score of all of them

#### 1. Perfect Alignment

```python
# import needed libraries
>>> import pandas as pd
Expand All @@ -81,33 +83,126 @@ $ python -m unittest discover -v
{
# on topic A, individuals 0 to 3 have opinion 0,
# individuals 4 to 7 have opinion 1
"A": [0, 0, 1, 1],
"B": [0, 1, 0, 1],
"C": [1, 0, 1, 0]
"A": [0, 0, 0, 0, 1, 1, 1, 1],
"B": [0, 0, 0, 0, 1, 1, 1, 1],
"C": [1, 1, 1, 1, 0, 0, 0, 0],
}
)

# get list of labels for the consensus partition
>>> partition_labels = mac.get_consensus_labels(opinions=df)

# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
1.0

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=True,
... )
0.8767167706710732

# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=False,
... )
1.0

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="ami", adjusted=True,
... df, which_score="ami", adjusted=True,
... )
0.933281539775369
```

In this example, we computed multiway alignment for a perfectly aligned system of 8 individuals and 3 topics.
Both the multiway alignment scores obtained by using NMI and by using AMI give 1 (perfect alignment). However, if we are dealing with a sample of the population, we might want to account for the number of individuals, which is quite small here. To do so, we can adjust the scores with the null model. The resulting alignment score is still quite high, but it accounts for the fact that the perfect alignment we observe among only 8 individuals might arise by chance.

With a growing number of individuals, the effect of alignment arising from random chance is smaller, and the score does not overfit:

```python
>>> n_individuals = 1000
>>> df = pd.DataFrame(
{
"A": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
"B": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
"C": [0] * int(n_individuals/2) + [1] * int(n_individuals/2),
}
)


# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
6.40685300762983e-16
1.0

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="nmi", adjusted=True,
... df, which_score="nmi", adjusted=True,
... )
0.0
0.9991381194997214

# if we use NMI (normalized mutual info score) without adjusting it
# with a null model, the resulting score is inflated
# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, partition_labels, which_score="nmi", adjusted=False,
... df, which_score="ami", adjusted=False,
... )
0.6666666666666666
1.0

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=True,
... )
0.9998316111697388
```

#### 2. No Alignment

```python
>>> n_individuals = 10000
>>> opinions = np.array([0] * int(n_individuals/2) + [1] * int(n_individuals/2))
>>> o1 = opinions.copy()
>>> np.random.shuffle(opinions)
>>> o2 = opinions.copy()
>>> np.random.shuffle(opinions)
>>> o3 = opinions.copy()
>>> df = pd.DataFrame(
{
"A": o1,
"B": o2,
"C": o3,
}
)

# compute 3-way alignment score using NMI (normalized mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=False,
... )
0.0002596921754934203

# compute 3-way alignment score using NMI (normalized mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="nmi", adjusted=True,
... )
0.00014352480953112452

# compute 3-way alignment score using AMI (adjusted mutual info score)
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=False,
... )
0.00011540052022510332

# compute 3-way alignment score using AMI (adjusted mutual info score)
# and adjust with the null model
>>> mas.multiway_alignment_score(
... df, which_score="ami", adjusted=True,
... )
5.4181170472338464e-05
```

For a random system, the multiway alignment score approaches 0.
4 changes: 2 additions & 2 deletions multiway_alignment/data/anes.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None:
.reset_index(drop=True)
)
dump_name = f"survey_{year}"
full, _ = mw_score.maximal_alignment_curve_nminusone(
full, _ = mw_score.maximal_alignment_curve(
opinions=_df,
which_score="ami",
adjusted=False,
Expand All @@ -103,7 +103,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None:
dump(full, dump_name + "_nminus1_ami_full")

print("null model")
mw_null.random_full_alignment_curves_kminusone(
mw_null.random_full_alignment_curves(
df=_df,
save_to=dump_name + "_nminus1_null",
which_score="ami",
Expand Down
27 changes: 16 additions & 11 deletions multiway_alignment/null_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_null_model(opinions: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
return null


def _one_iter(
def _one_iter_fullpartition(
opinions: Union[pd.DataFrame, pd.Series],
which_score: str = "ami",
adjusted: bool = False,
Expand All @@ -45,13 +45,13 @@ def _one_iter(
"""
null = get_null_model(opinions=opinions)

_full_res, _ = ma_score.maximal_alignment_curve(
_full_res, _ = ma_score.maximal_alignment_curve_fullpartition(
null, which_score=which_score, adjusted=adjusted
)
return _full_res


def _one_iter_kminusone(
def _one_iter(
opinions: Union[pd.DataFrame, pd.Series],
which_score: str = "ami",
adjusted: bool = False,
Expand All @@ -65,13 +65,13 @@ def _one_iter_kminusone(
"""
null = get_null_model(opinions=opinions)

_full_res, _ = ma_score.maximal_alignment_curve_nminusone(
_full_res, _ = ma_score.maximal_alignment_curve(
null, which_score=which_score, adjusted=adjusted
)
return _full_res


def random_full_alignment_curves(
def random_full_alignment_curves_fullpartition(
df: pd.DataFrame,
save_to: str,
which_score: str = "ami",
Expand All @@ -97,7 +97,10 @@ def random_full_alignment_curves(
logger.info(f"Created new directory {save_to}")
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
partial(_one_iter, **{"which_score": which_score, "adjusted": adjusted}),
partial(
_one_iter_fullpartition,
**{"which_score": which_score, "adjusted": adjusted}, # type: ignore
),
[df.copy()] * n_tries,
)
i = 0
Expand All @@ -106,7 +109,7 @@ def random_full_alignment_curves(
i += 1


def random_full_alignment_curves_kminusone(
def random_full_alignment_curves(
df: pd.DataFrame,
save_to: str,
which_score: str = "ami",
Expand All @@ -133,8 +136,8 @@ def random_full_alignment_curves_kminusone(
with Pool(processes=mp.cpu_count() - 1) as pool:
result = pool.map_async(
partial(
_one_iter_kminusone,
**{"which_score": which_score, "adjusted": adjusted},
_one_iter,
**{"which_score": which_score, "adjusted": adjusted}, # type: ignore
),
[df.copy()] * n_tries,
)
Expand All @@ -144,7 +147,9 @@ def random_full_alignment_curves_kminusone(
i += 1


def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]:
def expected_curve_fullpartition(
opinions: Union[pd.DataFrame, pd.Series]
) -> List[float]:
"""
:param opinions: pd.DataFrame having one column per layer and one row per node,
where each element a_ij is an integer representing the cluster labels for node i at layer j
Expand Down Expand Up @@ -181,5 +186,5 @@ def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]:
return _expected_best_scores


def expected_curve_equal_sized_clusters(n_layers: int) -> List[float]:
def expected_curve_fullpartition_equal_sized_clusters(n_layers: int) -> List[float]:
return [2 / (1 + k) for k in range(2, n_layers + 1)]
12 changes: 6 additions & 6 deletions multiway_alignment/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _layer_expectation(
return np.array(_all_scores).mean()


def multiway_alignment_score_nminusone(
def multiway_alignment_score(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -76,7 +76,7 @@ def multiway_alignment_score_nminusone(
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def multiway_alignment_score(
def multiway_alignment_score_fullpartition(
opinions: typing.Union[pd.DataFrame, pd.Series],
mutual_clusters_labels: typing.List,
which_score: str = "nmi",
Expand Down Expand Up @@ -114,7 +114,7 @@ def multiway_alignment_score(
return (avg_nmi - _expected_nmi) / len(opinions.columns)


def maximal_alignment_curve_nminusone(
def maximal_alignment_curve(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -161,7 +161,7 @@ def maximal_alignment_curve_nminusone(
l_comb_df.reset_index(drop=True, inplace=True)

# CRITERIA
nmi = multiway_alignment_score_nminusone(
nmi = multiway_alignment_score(
l_comb_df, which_score=which_score, adjusted=adjusted
)

Expand Down Expand Up @@ -189,7 +189,7 @@ def maximal_alignment_curve_nminusone(
return all_scores_by_combination_size, best_by_combination_size


def maximal_alignment_curve(
def maximal_alignment_curve_fullpartition(
opinions: typing.Union[pd.DataFrame, pd.Series],
which_score: str = "nmi",
adjusted: bool = False,
Expand Down Expand Up @@ -239,7 +239,7 @@ def maximal_alignment_curve(
labels_list = get_consensus_labels(opinions=l_comb_df)

# CRITERIA
nmi = multiway_alignment_score(
nmi = multiway_alignment_score_fullpartition(
l_comb_df, labels_list, which_score=which_score, adjusted=adjusted
)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="multiway_alignment",
version="0.1.0",
version="0.0.1",
packages=find_packages(),
author="Letizia Iannucci",
author_email="letizia.iannucci@aalto.fi",
Expand Down
65 changes: 0 additions & 65 deletions tests/test_compute_maximal_alignment_curve.py

This file was deleted.

Loading

0 comments on commit 32d6838

Please sign in to comment.