diff --git a/README.md b/README.md index 2333a41..773bda2 100644 --- a/README.md +++ b/README.md @@ -54,4 +54,4 @@ Additional issues can be found by running `python3 -m flake8 .` and `python3 -m ## Tests -This code has test coverage for python 3.10, 3.11, and 3.12. +This code is built and tested for python 3.10, 3.11, and 3.12. diff --git a/RUNBOOK.md b/RUNBOOK.md index ec9c5c9..80025a6 100644 --- a/RUNBOOK.md +++ b/RUNBOOK.md @@ -70,6 +70,8 @@ $ python -m unittest discover -v ### Given opinion partitions for each of the topics, compute the multiway alignment score of all of them +#### 1. Perfect Alignment + ```python # import needed libraries >>> import pandas as pd @@ -81,33 +83,126 @@ $ python -m unittest discover -v { # on topic A, individuals 0 and 1 have opinion 0, # individuals 2 and 3 have opinion 1 - "A": [0, 0, 1, 1], - "B": [0, 1, 0, 1], - "C": [1, 0, 1, 0] + "A": [0, 0, 0, 0, 1, 1, 1, 1], + "B": [0, 0, 0, 0, 1, 1, 1, 1], + "C": [1, 1, 1, 1, 0, 0, 0, 0], } ) -# get list of labels for the consensus partition ->>> partition_labels = mac.get_consensus_labels(opinions=df) + +# compute 3-way alignment score using NMI (normalized mutual info score) +>>> mas.multiway_alignment_score( +... df, which_score="nmi", adjusted=False, +... ) +1.0 + +# compute 3-way alignment score using NMI (normalized mutual info score) +# and adjust with the null model +>>> mas.multiway_alignment_score( +... df, which_score="nmi", adjusted=True, +... ) +0.8767167706710732 + +# compute 3-way alignment score using AMI (adjusted mutual info score) +>>> mas.multiway_alignment_score( +... df, which_score="ami", adjusted=False, +... ) +1.0 # compute 3-way alignment score using AMI (adjusted mutual info score) # and adjust with the null model >>> mas.multiway_alignment_score( -... df, partition_labels, which_score="ami", adjusted=True, +... df, which_score="ami", adjusted=True, +... 
) +0.933281539775369 +``` + +In this example, we computed multiway alignment for a perfectly aligned system of 8 individuals and 3 topics. +Both the multiway alignment scores obtained by using NMI and by using AMI give 1 (perfect alignment). However, if we are dealing with a sample of the population, we might want to account for the number of individuals, which here is quite small. To do so, we can adjust the scores with the null model. The resulting alignment score is still quite high, but accounts for the fact that this perfect alignment we are seeing among 8 individuals might have arisen by chance. + +With a growing number of individuals, the effect of alignment arising from random chance is smaller, and the score does not overfit: + +```python +>>> n_individuals = 1000 +>>> df = pd.DataFrame( + { + "A": [0] * int(n_individuals/2) + [1] * int(n_individuals/2), + "B": [0] * int(n_individuals/2) + [1] * int(n_individuals/2), + "C": [0] * int(n_individuals/2) + [1] * int(n_individuals/2), + } +) + + +# compute 3-way alignment score using NMI (normalized mutual info score) +>>> mas.multiway_alignment_score( +... df, which_score="nmi", adjusted=False, ... ) -6.40685300762983e-16 +1.0 # compute 3-way alignment score using NMI (normalized mutual info score) # and adjust with the null model >>> mas.multiway_alignment_score( -... df, partition_labels, which_score="nmi", adjusted=True, +... df, which_score="nmi", adjusted=True, ... ) -0.0 +0.9991381194997214 -# if we use NMI (normalized mutual info score) without adjusting it -# with a null model, the resulting score is inflated +# compute 3-way alignment score using AMI (adjusted mutual info score) >>> mas.multiway_alignment_score( -... df, partition_labels, which_score="nmi", adjusted=False, +... df, which_score="ami", adjusted=False, ... ) -0.6666666666666666 +1.0 + +# compute 3-way alignment score using AMI (adjusted mutual info score) +# and adjust with the null model +>>> mas.multiway_alignment_score( +...
df, which_score="ami", adjusted=True, +... ) +0.9998316111697388 +``` + +#### 2. No Alignment + +```python +>>> n_individuals = 10000 +>>> opinions = np.array([0] * int(n_individuals/2) + [1] * int(n_individuals/2)) +>>> o1 = opinions.copy() +>>> np.random.shuffle(opinions) +>>> o2 = opinions.copy() +>>> np.random.shuffle(opinions) +>>> o3 = opinions.copy() +>>> df = pd.DataFrame( + { + "A": o1, + "B": o2, + "C": o3, + } +) + +# compute 3-way alignment score using NMI (normalized mutual info score) +>>> mas.multiway_alignment_score( +... df, which_score="nmi", adjusted=False, +... ) +0.0002596921754934203 + +# compute 3-way alignment score using NMI (normalized mutual info score) +# and adjust with the null model +>>> mas.multiway_alignment_score( +... df, which_score="nmi", adjusted=True, +... ) +0.00014352480953112452 + +# compute 3-way alignment score using AMI (adjusted mutual info score) +>>> mas.multiway_alignment_score( +... df, which_score="ami", adjusted=False, +... ) +0.00011540052022510332 + +# compute 3-way alignment score using AMI (adjusted mutual info score) +# and adjust with the null model +>>> mas.multiway_alignment_score( +... df, which_score="ami", adjusted=True, +... ) +5.4181170472338464e-05 ``` + +For a random system, multiway alignment score approaches 0. 
diff --git a/multiway_alignment/data/anes.py b/multiway_alignment/data/anes.py index b579cca..76aed39 100644 --- a/multiway_alignment/data/anes.py +++ b/multiway_alignment/data/anes.py @@ -94,7 +94,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None: .reset_index(drop=True) ) dump_name = f"survey_{year}" - full, _ = mw_score.maximal_alignment_curve_nminusone( + full, _ = mw_score.maximal_alignment_curve( opinions=_df, which_score="ami", adjusted=False, @@ -103,7 +103,7 @@ def compute_all_alignments(timeseries: pd.DataFrame) -> None: dump(full, dump_name + "_nminus1_ami_full") print("null model") - mw_null.random_full_alignment_curves_kminusone( + mw_null.random_full_alignment_curves( df=_df, save_to=dump_name + "_nminus1_null", which_score="ami", diff --git a/multiway_alignment/null_models.py b/multiway_alignment/null_models.py index 1b9d88f..e0ef25b 100644 --- a/multiway_alignment/null_models.py +++ b/multiway_alignment/null_models.py @@ -31,7 +31,7 @@ def get_null_model(opinions: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame: return null -def _one_iter( +def _one_iter_fullpartition( opinions: Union[pd.DataFrame, pd.Series], which_score: str = "ami", adjusted: bool = False, @@ -45,13 +45,13 @@ def _one_iter( """ null = get_null_model(opinions=opinions) - _full_res, _ = ma_score.maximal_alignment_curve( + _full_res, _ = ma_score.maximal_alignment_curve_fullpartition( null, which_score=which_score, adjusted=adjusted ) return _full_res -def _one_iter_kminusone( +def _one_iter( opinions: Union[pd.DataFrame, pd.Series], which_score: str = "ami", adjusted: bool = False, @@ -65,13 +65,13 @@ def _one_iter_kminusone( """ null = get_null_model(opinions=opinions) - _full_res, _ = ma_score.maximal_alignment_curve_nminusone( + _full_res, _ = ma_score.maximal_alignment_curve( null, which_score=which_score, adjusted=adjusted ) return _full_res -def random_full_alignment_curves( +def random_full_alignment_curves_fullpartition( df: pd.DataFrame, save_to: str, 
which_score: str = "ami", @@ -97,7 +97,10 @@ def random_full_alignment_curves( logger.info(f"Created new directory {save_to}") with Pool(processes=mp.cpu_count() - 1) as pool: result = pool.map_async( - partial(_one_iter, **{"which_score": which_score, "adjusted": adjusted}), + partial( + _one_iter_fullpartition, + **{"which_score": which_score, "adjusted": adjusted}, # type: ignore + ), [df.copy()] * n_tries, ) i = 0 @@ -106,7 +109,7 @@ def random_full_alignment_curves( i += 1 -def random_full_alignment_curves_kminusone( +def random_full_alignment_curves( df: pd.DataFrame, save_to: str, which_score: str = "ami", @@ -133,8 +136,8 @@ def random_full_alignment_curves_kminusone( with Pool(processes=mp.cpu_count() - 1) as pool: result = pool.map_async( partial( - _one_iter_kminusone, - **{"which_score": which_score, "adjusted": adjusted}, + _one_iter, + **{"which_score": which_score, "adjusted": adjusted}, # type: ignore ), [df.copy()] * n_tries, ) @@ -144,7 +147,9 @@ def random_full_alignment_curves_kminusone( i += 1 -def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]: +def expected_curve_fullpartition( + opinions: Union[pd.DataFrame, pd.Series] +) -> List[float]: """ :param opinions: pd.DataFrame having one column per layer and one row per node, where each element a_ij is an integer representing the cluster labels for node i at layer j @@ -181,5 +186,5 @@ def expected_curve(opinions: Union[pd.DataFrame, pd.Series]) -> List[float]: return _expected_best_scores -def expected_curve_equal_sized_clusters(n_layers: int) -> List[float]: +def expected_curve_fullpartition_equal_sized_clusters(n_layers: int) -> List[float]: return [2 / (1 + k) for k in range(2, n_layers + 1)] diff --git a/multiway_alignment/score.py b/multiway_alignment/score.py index 45561dd..64eddba 100644 --- a/multiway_alignment/score.py +++ b/multiway_alignment/score.py @@ -37,7 +37,7 @@ def _layer_expectation( return np.array(_all_scores).mean() -def 
multiway_alignment_score_nminusone( +def multiway_alignment_score( opinions: typing.Union[pd.DataFrame, pd.Series], which_score: str = "nmi", adjusted: bool = False, @@ -76,7 +76,7 @@ def multiway_alignment_score_nminusone( return (avg_nmi - _expected_nmi) / len(opinions.columns) -def multiway_alignment_score( +def multiway_alignment_score_fullpartition( opinions: typing.Union[pd.DataFrame, pd.Series], mutual_clusters_labels: typing.List, which_score: str = "nmi", @@ -114,7 +114,7 @@ def multiway_alignment_score( return (avg_nmi - _expected_nmi) / len(opinions.columns) -def maximal_alignment_curve_nminusone( +def maximal_alignment_curve( opinions: typing.Union[pd.DataFrame, pd.Series], which_score: str = "nmi", adjusted: bool = False, @@ -161,7 +161,7 @@ def maximal_alignment_curve_nminusone( l_comb_df.reset_index(drop=True, inplace=True) # CRITERIA - nmi = multiway_alignment_score_nminusone( + nmi = multiway_alignment_score( l_comb_df, which_score=which_score, adjusted=adjusted ) @@ -189,7 +189,7 @@ def maximal_alignment_curve_nminusone( return all_scores_by_combination_size, best_by_combination_size -def maximal_alignment_curve( +def maximal_alignment_curve_fullpartition( opinions: typing.Union[pd.DataFrame, pd.Series], which_score: str = "nmi", adjusted: bool = False, @@ -239,7 +239,7 @@ def maximal_alignment_curve( labels_list = get_consensus_labels(opinions=l_comb_df) # CRITERIA - nmi = multiway_alignment_score( + nmi = multiway_alignment_score_fullpartition( l_comb_df, labels_list, which_score=which_score, adjusted=adjusted ) diff --git a/setup.py b/setup.py index cfe6ce6..de6cc37 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="multiway_alignment", - version="0.1.0", + version="0.0.1", packages=find_packages(), author="Letizia Iannucci", author_email="letizia.iannucci@aalto.fi", diff --git a/tests/test_compute_maximal_alignment_curve.py b/tests/test_compute_maximal_alignment_curve.py deleted file mode 100644 index 3454245..0000000 --- 
a/tests/test_compute_maximal_alignment_curve.py +++ /dev/null @@ -1,65 +0,0 @@ -import unittest - -import pandas as pd - -from multiway_alignment.score import maximal_alignment_curve - - -class TestComputeMaximalAlignmentCurve(unittest.TestCase): - """ - Test functionality of score.compute_maximal_alignment_curve() - ------------ - Example - ------------ - >>> python3 -m unittest -v tests.test_compute_maximal_alignment_curve - """ - - def test_on_empty(self): - """ - compute_maximal_alignment_curve returns a tuple with two dictionaries - """ - _a = pd.DataFrame() - _resall, _res0 = maximal_alignment_curve(_a) - self.assertIsInstance( - _resall, - dict, - f"""compute_maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_resall)}""", - ) - self.assertDictEqual( - _resall, - dict(), - f"""compute_maximal_alignment_curve on empty input should return a tuple with two empty dictionaries, - but one was {_resall}""", - ) - self.assertIsInstance( - _res0, - dict, - f"""compute_maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_res0)}""", - ) - self.assertDictEqual( - _res0, - dict(), - f"""compute_maximal_alignment_curve on empty input should return a tuple with two empty dictionaries, - but one was {_res0}""", - ) - - def test_on_one_layer(self): - """ - compute_maximal_alignment_curve returns a tuple with two dictionaries - """ - _a = pd.DataFrame({"A": [0, 1, 2]}) - _resall, _res0 = maximal_alignment_curve(_a) - self.assertIsInstance( - _resall, - dict, - f"""compute_maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_resall)}""", - ) - self.assertIsInstance( - _res0, - dict, - f"""compute_maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_res0)}""", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_compute_multilayer_alignment_score.py b/tests/test_compute_multilayer_alignment_score.py deleted file 
mode 100644 index e22f589..0000000 --- a/tests/test_compute_multilayer_alignment_score.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest - -import pandas as pd - -from multiway_alignment.score import multiway_alignment_score - - -class TestComputeMultiwayAlignmentScore(unittest.TestCase): - """ - Test functionality of mutual_clusters.compute_multiway_alignment_score() - ------------ - Example - ------------ - >>> python3 -m unittest -v tests.test_compute_multiway_alignment_score - """ - - def test_on_empty(self): - """ - compute_multiway_alignment_score returns a float - """ - _a = pd.DataFrame({"A": [0, 1, 2]}) - _labels = ["a", "b", "c"] - _res0 = multiway_alignment_score(_a, _labels) - self.assertIsInstance( - _res0, - float, - f"""compute_multiway_alignment_score should return a float, but returned {type(_res0)}""", - ) - self.assertGreaterEqual( - _res0, - 0.0, - """compute_multiway_alignment_score should return the correct value of avg NMI""", - ) - self.assertLessEqual( - _res0, - 1.0, - """compute_multiway_alignment_score should return the correct value of avg NMI""", - ) - self.assertEqual( - _res0, - 1.0, - """compute_multiway_alignment_score should return the correct value of avg NMI""", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_maximal_alignment_curve.py b/tests/test_maximal_alignment_curve.py new file mode 100644 index 0000000..4722263 --- /dev/null +++ b/tests/test_maximal_alignment_curve.py @@ -0,0 +1,65 @@ +import unittest + +import pandas as pd + +from multiway_alignment.score import maximal_alignment_curve + + +class TestComputeMaximalAlignmentCurve(unittest.TestCase): + """ + Test functionality of score.maximal_alignment_curve() + ------------ + Example + ------------ + >>> python3 -m unittest -v tests.test_maximal_alignment_curve + """ + + def test_on_empty(self): + """ + maximal_alignment_curve returns a tuple with two dictionaries + """ + _a = pd.DataFrame() + _resall, _res0 = maximal_alignment_curve(_a) + 
self.assertIsInstance( + _resall, + dict, + f"""maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_resall)}""", + ) + self.assertDictEqual( + _resall, + dict(), + f"""maximal_alignment_curve on empty input should return a tuple with two empty dictionaries, + but one was {_resall}""", + ) + self.assertIsInstance( + _res0, + dict, + f"""maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_res0)}""", + ) + self.assertDictEqual( + _res0, + dict(), + f"""maximal_alignment_curve on empty input should return a tuple with two empty dictionaries, + but one was {_res0}""", + ) + + def test_on_three_dimensions(self): + """ + maximal_alignment_curve returns a tuple with two dictionaries + """ + _a = pd.DataFrame({"A": [0, 1, 2], "B": [0, 1, 2], "C": [0, 1, 2]}) + _resall, _res0 = maximal_alignment_curve(_a) + self.assertIsInstance( + _resall, + dict, + f"""maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_resall)}""", + ) + self.assertIsInstance( + _res0, + dict, + f"""maximal_alignment_curve should return a tuple with two dictionaries, but one was {type(_res0)}""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_maximal_alignment_curve_full.py b/tests/test_maximal_alignment_curve_full.py new file mode 100644 index 0000000..75622b5 --- /dev/null +++ b/tests/test_maximal_alignment_curve_full.py @@ -0,0 +1,65 @@ +import unittest + +import pandas as pd + +from multiway_alignment.score import maximal_alignment_curve_fullpartition + + +class TestComputeMaximalAlignmentCurveFull(unittest.TestCase): + """ + Test functionality of score.maximal_alignment_curve_fullpartition() + ------------ + Example + ------------ + >>> python3 -m unittest -v tests.test_maximal_alignment_curve_full + """ + + def test_on_empty(self): + """ + maximal_alignment_curve_fullpartition returns a tuple with two dictionaries + """ + _a = pd.DataFrame() +
_resall, _res0 = maximal_alignment_curve_fullpartition(_a) + self.assertIsInstance( + _resall, + dict, + f"""maximal_alignment_curve_fullpartition should return a tuple with two dictionaries, but one was {type(_resall)}""", + ) + self.assertDictEqual( + _resall, + dict(), + f"""maximal_alignment_curve_fullpartition on empty input should return a tuple with two empty dictionaries, + but one was {_resall}""", + ) + self.assertIsInstance( + _res0, + dict, + f"""maximal_alignment_curve_fullpartition should return a tuple with two dictionaries, but one was {type(_res0)}""", + ) + self.assertDictEqual( + _res0, + dict(), + f"""maximal_alignment_curve_fullpartition on empty input should return a tuple with two empty dictionaries, + but one was {_res0}""", + ) + + def test_on_one_layer(self): + """ + maximal_alignment_curve_fullpartition returns a tuple with two dictionaries + """ + _a = pd.DataFrame({"A": [0, 1, 2]}) + _resall, _res0 = maximal_alignment_curve_fullpartition(_a) + self.assertIsInstance( + _resall, + dict, + f"""maximal_alignment_curve_fullpartition should return a tuple with two dictionaries, but one was {type(_resall)}""", + ) + self.assertIsInstance( + _res0, + dict, + f"""maximal_alignment_curve_fullpartition should return a tuple with two dictionaries, but one was {type(_res0)}""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_multiway_alignment_score.py b/tests/test_multiway_alignment_score.py new file mode 100644 index 0000000..e8bb5c3 --- /dev/null +++ b/tests/test_multiway_alignment_score.py @@ -0,0 +1,96 @@ +import unittest + +import pandas as pd + +from multiway_alignment.score import multiway_alignment_score + + +class TestComputeMultiwayAlignmentScore(unittest.TestCase): + """ + Test functionality of score.multiway_alignment_score() + ------------ + Example + ------------ + >>> python3 -m unittest -v tests.test_multiway_alignment_score + """ + + def test_on_empty(self): + """ + multiway_alignment_score raises
ZeroDivisionError if the dataframe is empty + """ + _a = pd.DataFrame() + # the function should raise ZeroDivisionError if the dataframe is empty + with self.assertRaises(ZeroDivisionError): + multiway_alignment_score(_a) + + def test_on_single_dimension(self): + """ + multiway_alignment_score raises ValueError if there is only one dimension + """ + _a = pd.DataFrame({"A": [0, 1, 2]}) + # the function should raise ValueError if there is only one dimension + with self.assertRaises(ValueError): + multiway_alignment_score(_a) + + def test_on_two_dimensions(self): + """ + multiway_alignment_score returns a float + """ + _a = pd.DataFrame({"A": [0, 1, 2], "B": [0, 1, 2]}) + _res0 = multiway_alignment_score(_a, "nmi", False) + self.assertIsInstance( + _res0, + float, + f"""multiway_alignment_score should return a float, but returned {type(_res0)}""", + ) + self.assertAlmostEqual( + _res0, + 1.0, + msg="""multiway_alignment_score should return the correct value in case of perfect alignment when nmi is used""", + ) + + _res1 = multiway_alignment_score(_a, "ami", False) + self.assertIsInstance( + _res1, + float, + f"""multiway_alignment_score should return a float, but returned {type(_res1)}""", + ) + self.assertAlmostEqual( + _res1, + 1.0, + msg="""multiway_alignment_score should return the correct value in case of perfect alignment when ami is used""", + ) + + def test_on_three_dimensions(self): + """ + multiway_alignment_score returns a float + """ + _a = pd.DataFrame({"A": [0, 1, 2], "B": [0, 1, 2], "C": [0, 1, 2]}) + _res0 = multiway_alignment_score(_a, "nmi", False) + + self.assertIsInstance( + _res0, + float, + f"""multiway_alignment_score should return a float, but returned {type(_res0)}""", + ) + self.assertAlmostEqual( + _res0, + 1.0, + msg="""multiway_alignment_score should return the correct value in case of perfect alignment when nmi is used""", + ) + + _res1 = multiway_alignment_score(_a, "ami", False) + self.assertIsInstance( + _res1, + float, +
f"""multiway_alignment_score should return a float, but returned {type(_res1)}""", + ) + self.assertAlmostEqual( + _res1, + 1.0, + msg="""multiway_alignment_score should return the correct value in case of perfect alignment when ami is used""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_multiway_alignment_score_full.py b/tests/test_multiway_alignment_score_full.py new file mode 100644 index 0000000..37fbc4a --- /dev/null +++ b/tests/test_multiway_alignment_score_full.py @@ -0,0 +1,47 @@ +import unittest + +import pandas as pd + +from multiway_alignment.score import multiway_alignment_score_fullpartition + + +class TestComputeMultiwayAlignmentScoreFull(unittest.TestCase): + """ + Test functionality of score.multiway_alignment_score_fullpartition() + ------------ + Example + ------------ + >>> python3 -m unittest -v tests.test_multiway_alignment_score_full + """ + + def test_on_empty(self): + """ + multiway_alignment_score_fullpartition returns a float + """ + _a = pd.DataFrame({"A": [0, 1, 2]}) + _labels = ["a", "b", "c"] + _res0 = multiway_alignment_score_fullpartition(_a, _labels) + self.assertIsInstance( + _res0, + float, + f"""multiway_alignment_score_fullpartition should return a float, but returned {type(_res0)}""", + ) + self.assertGreaterEqual( + _res0, + 0.0, + """multiway_alignment_score_fullpartition should return the correct value of avg NMI""", + ) + self.assertLessEqual( + _res0, + 1.0, + """multiway_alignment_score_fullpartition should return the correct value of avg NMI""", + ) + self.assertEqual( + _res0, + 1.0, + """multiway_alignment_score_fullpartition should return the correct value of avg NMI""", + ) + + +if __name__ == "__main__": + unittest.main()