Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jaccard Dissimilarity Function #129

Merged
merged 12 commits into from
Jul 26, 2019
275 changes: 274 additions & 1 deletion kmodes/tests/test_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sklearn.utils.testing import assert_equal

from kmodes.kmodes import KModes
from kmodes.util.dissim import ng_dissim
from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label


SOYBEAN = np.array([
Expand Down Expand Up @@ -124,6 +124,214 @@
# Drop target column
SOYBEAN2 = SOYBEAN2[:, :35]

# SOYBEAN Binary encoded
SOYBEAN3 = np.array([
BikashPandey17 marked this conversation as resolved.
Show resolved Hide resolved
[1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D3'],
[1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
[0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 'D4'],
])
# Drop target column
SOYBEAN3 = SOYBEAN3[:, :35]
BikashPandey17 marked this conversation as resolved.
Show resolved Hide resolved

SOYBEAN4 = np.array([
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BikashPandey17 , what's the source of this data? This is not the soy bean data (as the name suggests), right?

Copy link
Contributor Author

@BikashPandey17 BikashPandey17 Jul 24, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jaccard dissimilarity was not a good distance metric for the soybean data: as a result, the whole data set was assigned to a single cluster, which looked like this: [0 0 0 0 0 ... 0 0 0]. I wanted to include an example where distinct clusters were formed, so I used this data instead. SOYBEAN4 is actually a misleading name; this is just another categorical data set with label encoding (definitely not related to SOYBEAN).

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rename it then. I'll merge after that.

[2, 22, 14, 45, 2, 0, 1, 2, 5],
[2, 13, 13, 19, 2, 0, 1, 2, 5],
[3, 25, 4, 3, 0, 1, 2, 0, 4],
[2, 13, 15, 18, 0, 1, 2, 2, 3],
[3, 10, 4, 42, 0, 2, 1, 1, 2],
[2, 16, 21, 14, 0, 1, 2, 2, 2],
[2, 16, 19, 37, 0, 2, 1, 2, 2],
[2, 20, 9, 34, 0, 1, 2, 3, 5],
[2, 14, 21, 44, 0, 1, 2, 3, 2],
[2, 26, 5, 30, 0, 1, 2, 3, 3],
[3, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 20, 1, 27, 3, 3, 3, 2, 0],
[3, 6, 8, 19, 0, 1, 2, 1, 2],
[2, 13, 8, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 19, 42, 0, 1, 2, 2, 5],
[7, 7, 5, 43, 0, 2, 1, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[3, 3, 5, 12, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[7, 15, 19, 17, 0, 1, 2, 2, 2],
[1, 1, 15, 24, 0, 1, 2, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 5, 7, 9, 0, 1, 2, 3, 5],
[2, 24, 6, 10, 0, 2, 1, 2, 2],
[2, 13, 16, 29, 0, 2, 1, 2, 2],
[3, 6, 8, 1, 0, 1, 2, 2, 5],
[2, 16, 15, 34, 0, 1, 2, 2, 1],
[0, 24, 14, 12, 3, 3, 3, 2, 0],
[3, 8, 21, 13, 3, 3, 3, 2, 0],
[2, 17, 15, 42, 3, 3, 3, 2, 0],
[2, 25, 18, 16, 3, 3, 3, 2, 0],
[2, 3, 15, 42, 3, 3, 3, 2, 0],
[6, 13, 15, 22, 3, 3, 3, 2, 0],
[3, 8, 18, 24, 1, 0, 2, 2, 5],
[7, 20, 15, 26, 1, 0, 2, 2, 1],
[2, 20, 7, 35, 0, 1, 2, 2, 5],
[2, 16, 12, 28, 0, 1, 2, 2, 5],
[2, 16, 5, 39, 0, 1, 2, 2, 2],
[3, 6, 11, 8, 0, 1, 2, 2, 2],
[7, 6, 15, 44, 1, 0, 2, 2, 4],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 7, 6, 3, 3, 3, 2, 0],
[1, 13, 2, 46, 3, 3, 3, 2, 0],
[0, 14, 5, 41, 3, 3, 3, 2, 0],
[2, 24, 19, 0, 3, 3, 3, 2, 0],
[2, 14, 3, 35, 3, 3, 3, 2, 0],
[6, 19, 7, 5, 0, 2, 1, 2, 2],
[5, 6, 11, 44, 3, 3, 3, 2, 0],
[7, 16, 21, 21, 3, 3, 3, 2, 0],
[2, 19, 7, 44, 3, 3, 3, 2, 0],
[2, 24, 18, 33, 1, 0, 2, 1, 4],
[2, 16, 8, 44, 0, 2, 1, 2, 1],
[3, 2, 5, 15, 0, 1, 2, 2, 2],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 4, 15, 47, 0, 1, 2, 2, 2],
[7, 13, 15, 25, 0, 1, 2, 2, 1],
[1, 19, 10, 15, 3, 3, 3, 2, 0],
[2, 13, 5, 44, 0, 1, 2, 1, 2],
[5, 11, 18, 20, 3, 3, 3, 2, 0],
[7, 9, 5, 40, 0, 1, 2, 1, 4],
[3, 6, 16, 38, 3, 3, 3, 2, 0],
[2, 24, 22, 12, 0, 1, 2, 2, 3],
[5, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 16, 15, 23, 0, 1, 2, 2, 5],
[2, 13, 0, 25, 1, 0, 2, 2, 2],
[2, 23, 15, 36, 3, 3, 3, 2, 0],
[2, 25, 10, 2, 1, 0, 2, 2, 5],
[2, 21, 7, 4, 1, 0, 2, 2, 1],
[1, 18, 17, 41, 3, 3, 3, 2, 0],
[2, 18, 17, 41, 3, 3, 3, 2, 0],
[6, 9, 1, 0, 3, 3, 3, 2, 0],
[1, 7, 20, 47, 3, 3, 3, 2, 0],
[2, 25, 10, 7, 0, 1, 2, 2, 2],
[7, 0, 4, 32, 1, 2, 0, 2, 5],
[1, 12, 12, 15, 0, 1, 2, 3, 3],
[2, 26, 15, 25, 0, 1, 2, 0, 5],
[2, 20, 15, 19, 0, 1, 2, 2, 1],
[4, 6, 9, 11, 2, 0, 1, 1, 4],
[2, 13, 15, 42, 0, 2, 1, 2, 2],
[3, 5, 21, 31, 0, 1, 2, 3, 5],
[2, 13, 19, 33, 0, 2, 1, 2, 2],
[1, 11, 10, 0, 0, 2, 1, 0, 2]
])

SOYBEAN5 = np.array([
BikashPandey17 marked this conversation as resolved.
Show resolved Hide resolved
[1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D1'],
[1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 'D2'],
[0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 'D3'],
[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 'D4'],
])
# Drop target column
SOYBEAN5 = SOYBEAN5[:, :35]

# Four 9-column rows passed to fit_predict in
# test_kmodes_predict_soybean_jaccard_dissim_label. The first and last rows
# repeat the first and last rows of SOYBEAN4.
# NOTE(review): presumably label-encoded categorical data like SOYBEAN4 and,
# despite the name, not derived from the soybean data set -- confirm.
SOYBEAN6 = np.array([
[2, 22, 14, 45, 2, 0, 1, 2, 5],
[7, 13, 13, 19, 2, 0, 1, 2, 5],
[5, 18, 19, 33, 0, 2, 1, 2, 2],
[1, 11, 10, 0, 0, 2, 1, 0, 2]
])


def assert_cluster_splits_equal(array1, array2):

Expand Down Expand Up @@ -334,6 +542,71 @@ def test_kmodes_nunique_nclusters_ng(self):
np.array([[0, 2],
[0, 1]]))

def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
    # Huang initialisation with the binary Jaccard measure on SOYBEAN3.
    # The expected cluster split is pinned for random_state=42.
    expected_labels = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3,
                                3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
                                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    )
    labels = model.fit_predict(SOYBEAN3)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)

def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
    # Cao initialisation with the binary Jaccard measure on SOYBEAN3.
    # The expected cluster split is pinned for random_state=42.
    expected_labels = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2,
                                2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Cao',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    )
    labels = model.fit_predict(SOYBEAN3)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)

def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
    # Fit on SOYBEAN3 first, then label the four SOYBEAN5 rows.
    # NOTE(review): the second step calls fit_predict, which presumably
    # refits on SOYBEAN5 instead of reusing the model fitted on SOYBEAN3 --
    # confirm whether predict() was intended for a "predict" test.
    expected_labels = np.array([1, 0, 1, 1])
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_binary,
        random_state=42,
    ).fit(SOYBEAN3)
    labels = model.fit_predict(SOYBEAN5)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)

def test_kmodes_huang_soybean_jaccard_dissim_label(self):
    # Huang initialisation with the label Jaccard measure on SOYBEAN4.
    # The expected cluster split is pinned for random_state=42.
    expected_labels = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
                                0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
                                0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2,
                                0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1,
                                0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1])
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    )
    labels = model.fit_predict(SOYBEAN4)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)

def test_kmodes_cao_soybean_jaccard_dissim_label(self):
    # Cao initialisation with the label Jaccard measure on SOYBEAN4.
    # The expected cluster split is pinned for random_state=42.
    expected_labels = np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 1, 0,
                                1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0,
                                0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
                                0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0])
    kmodes_cao = KModes(
        n_clusters=4,
        n_init=2,
        init='Cao',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    )
    labels = kmodes_cao.fit_predict(SOYBEAN4)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)

def test_kmodes_predict_soybean_jaccard_dissim_label(self):
    # Fit on SOYBEAN4 first, then label the four SOYBEAN6 rows.
    # NOTE(review): the second step calls fit_predict, which presumably
    # refits on SOYBEAN6 instead of reusing the model fitted on SOYBEAN4 --
    # confirm whether predict() was intended for a "predict" test.
    expected_labels = np.array([0, 0, 0, 0])
    model = KModes(
        n_clusters=4,
        n_init=2,
        init='Huang',
        verbose=2,
        cat_dissim=jaccard_dissim_label,
        random_state=42,
    ).fit(SOYBEAN4)
    labels = model.fit_predict(SOYBEAN6)

    assert_cluster_splits_equal(labels, expected_labels)
    # Labels are expected to come back as unsigned 16-bit integers.
    label_dtype = np.dtype(np.uint16)
    self.assertTrue(labels.dtype == label_dtype)


def test_kmodes_ninit(self):
kmodes = KModes(n_init=10, init='Huang')
self.assertEqual(kmodes.n_init, 10)
Expand Down
22 changes: 22 additions & 0 deletions kmodes/util/dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,28 @@ def matching_dissim(a, b, **_):
return np.sum(a != b, axis=1)


def jaccard_dissim_binary(a, b, **__):
    """Jaccard dissimilarity function for binary encoded variables.

    Every row is read as the set of columns that hold a 1. For each pair of
    rows the result is ``1 - |intersection| / |union|``, so identical rows
    score 0 and rows that share no 1 score 1.

    Parameters
    ----------
    a : ndarray, 2-D
        One binary (0/1) row per item to compare.
    b : ndarray
        Binary array that broadcasts against ``a`` (a single row, or one
        row per row of ``a``).
    **__
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    ndarray of float
        One dissimilarity in [0, 1] per row of ``a``.

    Raises
    ------
    ValueError
        If ``a`` or ``b`` holds anything other than 0 and 1 (NaN included).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Compare against 0 and 1 directly: NaN equals neither, so missing values
    # are caught without depending on what a NaN-to-int cast happens to give.
    for values in (a, b):
        if not np.all(np.logical_or(values == 0, values == 1)):
            raise ValueError("Missing or non Binary values detected in Binary columns.")

    intersection = np.sum(np.logical_and(a, b), axis=1)
    union = np.sum(np.logical_or(a, b), axis=1)

    # Two all-zero rows have an empty union; they are identical, so their
    # similarity is taken as 1 instead of dividing by zero.
    empty_union = union == 0
    safe_union = np.where(empty_union, 1, union).astype(float)
    similarity = np.where(empty_union, 1.0, intersection / safe_union)
    return 1.0 - similarity


def jaccard_dissim_label(a, b, **__):
    """Jaccard dissimilarity function for label encoded variables.

    Every row of ``a`` and the whole of ``b`` are read as sets of distinct
    labels (column positions are ignored). For each row of ``a`` the result
    is ``1 - |intersection| / |union|`` of those two sets, so equal label
    sets score 0 and disjoint ones score 1.

    Parameters
    ----------
    a : ndarray, 2-D
        One row of non-negative integer labels per item to compare.
    b : ndarray
        Labels to compare every row of ``a`` against; flattened to one set.
    **__
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    ndarray of float
        One dissimilarity in [0, 1] per row of ``a``.

    Raises
    ------
    ValueError
        If ``a`` or ``b`` contains NaN or a negative value.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Test for NaN explicitly; casting NaN to int is not a defined way to
    # detect it. Negative values are still rejected, as before.
    for values in (a, b):
        as_float = values.astype(float)
        if np.isnan(as_float).any() or (as_float < 0).any():
            raise ValueError("Missing values detected in Numeric columns.")

    # The label set of b is the same for every row, so build it once.
    b_labels = np.unique(b)
    dissim = np.empty(len(a), dtype=float)
    for idx, row in enumerate(a):
        row_labels = np.unique(row)
        n_shared = len(np.intersect1d(row_labels, b_labels))
        # Union of distinct labels; counting raw lengths would inflate it
        # with duplicates.
        n_union = len(row_labels) + len(b_labels) - n_shared
        # Two empty label sets are identical, so their dissimilarity is 0.
        dissim[idx] = 1.0 - float(n_shared) / n_union if n_union else 0.0
    return dissim
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please handle exceptional case where union_len == 0.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should never actually happen unless both my arrays come in empty. Still, I'll put up a check, in case this situation arises.



def euclidean_dissim(a, b, **_):
"""Euclidean distance dissimilarity function"""
if np.isnan(a).any() or np.isnan(b).any():
Expand Down
22 changes: 22 additions & 0 deletions kmodes/util/tests/test_dissim.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sklearn.utils.testing import assert_equal, assert_array_equal

from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim
from kmodes.util.dissim import jaccard_dissim_binary, jaccard_dissim_label


class TestDissimilarityMeasures(unittest.TestCase):
Expand All @@ -25,6 +26,27 @@ def test_matching_dissim(self):
b = np.array([['a', 'b', 'c', 'd'], ['d', 'c', 'b', 'a']])
assert_array_equal(np.array([0, 4]), matching_dissim(a, b))

def test_jaccard_dissim_binary(self):
    reference = np.array([[0, 1, 1, 0, 1, 1]])

    # Three positions are set in both rows and four in at least one, so
    # 0.75 is intersection / union for this pair.
    # NOTE(review): that is the Jaccard coefficient; the matching distance
    # would be 0.25 -- confirm which value the function should return.
    other = np.array([[0, 1, 1, 0, 1, 0]])
    assert_equal(0.75, jaccard_dissim_binary(reference, other))

    # An input containing NaN must be rejected.
    with_nan = np.array([[0, np.NaN, 1, 0, 1, 0]])
    with self.assertRaises(ValueError):
        jaccard_dissim_binary(reference, with_nan)

def test_jaccard_dissim_label(self):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add tests where dissimilarity is 0 and 1.

Copy link
Contributor Author

@BikashPandey17 BikashPandey17 Jul 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nicodv Just to be sure: should I be calculating the Jaccard distance instead of the Jaccard coefficient? Have a look here: https://people.revoledu.com/kardi/tutorial/Similarity/Jaccard.html

Copy link
Owner

@nicodv nicodv Jul 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhh, that's correct!

It looks like you're doing the Jaccard coefficient now. You'll need to add 1 - x and update the tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I did that.

a = np.array([[0, 1, 2, 0, 1, 2]])
b = np.array([[0, 1, 2, 0, 1, 0]])
assert_equal(0.75, jaccard_dissim_label(a, b))

a = np.array([[np.NaN, 1, 2, 0, 1, 2]])
b = np.array([[0, 1, 2, 0, 1, 0]])
with self.assertRaises(ValueError):
jaccard_dissim_label(a, b)


def test_euclidian_dissim(self):
a = np.array([[0., 1., 2., 0., 1., 2.]])
b = np.array([[3., 1., 3., 0., 1., 0.]])
Expand Down