From 590a51f6dd704a85971df1327e54b7b9f99cf6f4 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 13:36:20 +0530 Subject: [PATCH 01/12] Added jaccard dissimilarity function for label encoded and binary encoded variables along with related unit tests --- kmodes/tests/test_kmodes.py | 276 ++++++++++++++++++++++++++++++- kmodes/util/dissim.py | 21 +++ kmodes/util/tests/test_dissim.py | 23 ++- 3 files changed, 318 insertions(+), 2 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 2811159..6f7f3bd 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -9,7 +9,7 @@ from sklearn.utils.testing import assert_equal from kmodes.kmodes import KModes -from kmodes.util.dissim import ng_dissim +from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label SOYBEAN = np.array([ @@ -124,6 +124,214 @@ # Drop target column SOYBEAN2 = SOYBEAN2[:, :35] +# SOYBEAN Binary encoded +SOYBEAN3 = np.array([ + [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 
1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 
1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D3'], + [1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 
1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], + [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 1, 'D4'], +]) +# Drop target column +SOYBEAN3 = SOYBEAN3[:, :35] + +SOYBEAN4 = np.array([ + [ 2, 22, 14, 45, 2, 0, 1, 2, 5], + [ 2, 13, 13, 19, 2, 0, 1, 2, 5], + [ 3, 25, 4, 3, 0, 1, 2, 0, 4], + [ 2, 13, 15, 18, 0, 1, 2, 2, 3], + [ 3, 10, 4, 42, 0, 2, 1, 1, 2], + [ 2, 16, 21, 14, 0, 1, 2, 2, 2], + [ 2, 16, 19, 37, 0, 2, 1, 2, 2], + [ 2, 20, 9, 34, 0, 1, 2, 3, 5], + [ 2, 14, 21, 44, 0, 1, 2, 3, 2], + [ 2, 26, 5, 30, 0, 1, 2, 3, 3], + [ 3, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 20, 1, 27, 3, 3, 3, 2, 0], + [ 3, 6, 8, 19, 0, 1, 2, 1, 2], + [ 2, 13, 8, 41, 3, 3, 3, 2, 0], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 16, 19, 42, 0, 1, 2, 2, 5], + [ 7, 7, 5, 43, 0, 2, 1, 2, 2], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 3, 3, 5, 12, 3, 3, 3, 2, 0], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 7, 15, 19, 17, 0, 1, 2, 2, 2], + [ 1, 1, 15, 24, 0, 1, 2, 2, 2], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 5, 7, 9, 0, 1, 2, 3, 5], + [ 2, 24, 6, 10, 0, 2, 1, 2, 2], + [ 2, 13, 16, 29, 0, 2, 1, 2, 2], + [ 3, 6, 8, 1, 0, 1, 2, 2, 5], + [ 2, 16, 15, 34, 0, 1, 2, 2, 1], + [ 0, 24, 14, 12, 3, 3, 3, 2, 0], + [ 3, 8, 21, 13, 3, 3, 3, 2, 0], + [ 2, 17, 15, 42, 3, 3, 3, 2, 0], + [ 2, 25, 18, 16, 3, 3, 3, 2, 0], + [ 2, 3, 15, 42, 3, 3, 3, 2, 0], + [ 6, 13, 15, 22, 3, 3, 3, 2, 0], + [ 3, 8, 18, 24, 1, 0, 2, 2, 5], + [ 7, 20, 15, 26, 1, 0, 2, 2, 1], + [ 2, 20, 7, 35, 0, 1, 2, 2, 5], + [ 2, 16, 12, 28, 0, 1, 2, 2, 5], + [ 2, 16, 5, 39, 0, 1, 2, 2, 2], + [ 3, 6, 11, 8, 0, 1, 2, 2, 2], + [ 7, 6, 15, 44, 1, 0, 2, 2, 4], + [ 2, 18, 
17, 41, 3, 3, 3, 2, 0], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 16, 7, 6, 3, 3, 3, 2, 0], + [ 1, 13, 2, 46, 3, 3, 3, 2, 0], + [ 0, 14, 5, 41, 3, 3, 3, 2, 0], + [ 2, 24, 19, 0, 3, 3, 3, 2, 0], + [ 2, 14, 3, 35, 3, 3, 3, 2, 0], + [ 6, 19, 7, 5, 0, 2, 1, 2, 2], + [ 5, 6, 11, 44, 3, 3, 3, 2, 0], + [ 7, 16, 21, 21, 3, 3, 3, 2, 0], + [ 2, 19, 7, 44, 3, 3, 3, 2, 0], + [ 2, 24, 18, 33, 1, 0, 2, 1, 4], + [ 2, 16, 8, 44, 0, 2, 1, 2, 1], + [ 3, 2, 5, 15, 0, 1, 2, 2, 2], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 4, 15, 47, 0, 1, 2, 2, 2], + [ 7, 13, 15, 25, 0, 1, 2, 2, 1], + [ 1, 19, 10, 15, 3, 3, 3, 2, 0], + [ 2, 13, 5, 44, 0, 1, 2, 1, 2], + [ 5, 11, 18, 20, 3, 3, 3, 2, 0], + [ 7, 9, 5, 40, 0, 1, 2, 1, 4], + [ 3, 6, 16, 38, 3, 3, 3, 2, 0], + [ 2, 24, 22, 12, 0, 1, 2, 2, 3], + [ 5, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 16, 15, 23, 0, 1, 2, 2, 5], + [ 2, 13, 0, 25, 1, 0, 2, 2, 2], + [ 2, 23, 15, 36, 3, 3, 3, 2, 0], + [ 2, 25, 10, 2, 1, 0, 2, 2, 5], + [ 2, 21, 7, 4, 1, 0, 2, 2, 1], + [ 1, 18, 17, 41, 3, 3, 3, 2, 0], + [ 2, 18, 17, 41, 3, 3, 3, 2, 0], + [ 6, 9, 1, 0, 3, 3, 3, 2, 0], + [ 1, 7, 20, 47, 3, 3, 3, 2, 0], + [ 2, 25, 10, 7, 0, 1, 2, 2, 2], + [ 7, 0, 4, 32, 1, 2, 0, 2, 5], + [ 1, 12, 12, 15, 0, 1, 2, 3, 3], + [ 2, 26, 15, 25, 0, 1, 2, 0, 5], + [ 2, 20, 15, 19, 0, 1, 2, 2, 1], + [ 4, 6, 9, 11, 2, 0, 1, 1, 4], + [ 2, 13, 15, 42, 0, 2, 1, 2, 2], + [ 3, 5, 21, 31, 0, 1, 2, 3, 5], + [ 2, 13, 19, 33, 0, 2, 1, 2, 2], + [ 1, 11, 10, 0, 0, 2, 1, 0, 2] +]) + +SOYBEAN5 = np.array([ + [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D1'], + [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 'D2'], + [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 'D3'], + [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 1, 'D4'], +]) +# 
Drop target column +SOYBEAN5 = SOYBEAN5[:, :35] + +SOYBEAN6 = np.array([ + [ 2, 22, 14, 45, 2, 0, 1, 2, 5], + [ 7, 13, 13, 19, 2, 0, 1, 2, 5], + [ 5, 18, 19, 33, 0, 2, 1, 2, 2], + [ 1, 11, 10, 0, 0, 2, 1, 0, 2] +]) + def assert_cluster_splits_equal(array1, array2): @@ -334,6 +542,72 @@ def test_kmodes_nunique_nclusters_ng(self): np.array([[0, 2], [0, 1]])) + def test_kmodes_huang_soybean_jaccard_dissim_binary(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + result = kmodes_huang.fit_predict(SOYBEAN3) + expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, + 3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_cao_soybean_jaccard_dissim_binary(self): + kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + result = kmodes_Cao.fit_predict(SOYBEAN3) + expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2, + 2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_predict_soybean_jaccard_dissim_binary(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + kmodes_huang =kmodes_huang.fit(SOYBEAN3) + result = kmodes_huang.fit_predict(SOYBEAN5) + expected = np.array([1, 0, 1, 1]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + + def test_kmodes_huang_soybean_jaccard_dissim_label(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_label, random_state=42) + result = 
kmodes_huang.fit_predict(SOYBEAN4) + expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3, + 0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1, + 1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1, + 0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3, + 3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_cao_soybean_jaccard_dissim_label(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, + cat_dissim=jaccard_dissim_label, random_state=42) + result = kmodes_huang.fit_predict(SOYBEAN4) + expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, + 1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]) + + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_predict_soybean_jaccard_dissim_label(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_label, random_state=42) + kmodes_huang =kmodes_huang.fit(SOYBEAN4) + result = kmodes_huang.fit_predict(SOYBEAN6) + expected = np.array([0, 1, 0, 3]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_ninit(self): kmodes = KModes(n_init=10, init='Huang') self.assertEqual(kmodes.n_init, 10) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index 4f27471..ecdfcd5 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -10,6 +10,27 @@ def matching_dissim(a, b, **_): return np.sum(a != b, axis=1) +def jaccard_dissim_binary(a, b, **__): + """Jaccard dissimilarity function for biinary encoded variables""" + if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2: + raise 
ValueError("Missing or non Binary values detected in Binary columns.") + return np.sum(np.bitwise_and(a, b), axis=1)/np.sum(np.bitwise_or(a, b), axis=1) + + +def jaccard_dissim_label(a, b, **__): + """Jaccard dissimilarity function for label encoded variables""" + if (a.astype(int) < 0).any() or (b.astype(int) < 0).any(): + raise ValueError("Missing values detected in Numeric columns.") + intersect_len = np.empty(len(a), dtype=int) + union_len = np.empty(len(a), dtype=int) + i = 0 + for row in a: + intersect_len[i] = len(np.intersect1d(row, b)) + union_len[i] = len(np.union1d(row, b)) + i = i+1 + return intersect_len/union_len + + def euclidean_dissim(a, b, **_): """Euclidean distance dissimilarity function""" if np.isnan(a).any() or np.isnan(b).any(): diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index 4d8404c..d6b1f35 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -7,7 +7,7 @@ import numpy as np from sklearn.utils.testing import assert_equal, assert_array_equal -from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim +from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim, jaccard_dissim_binary, jaccard_dissim_label class TestDissimilarityMeasures(unittest.TestCase): @@ -25,6 +25,27 @@ def test_matching_dissim(self): b = np.array([['a', 'b', 'c', 'd'], ['d', 'c', 'b', 'a']]) assert_array_equal(np.array([0, 4]), matching_dissim(a, b)) + def test_jaccard_dissim_binary(self): + a = np.array([[0, 1, 1, 0, 1, 1]]) + b = np.array([[0, 1, 1, 0, 1, 0]]) + assert_equal(0.75, jaccard_dissim_binary(a, b)) + + a = np.array([[0, 1, 1, 0, 1, 1]]) + b = np.array([[0, np.NaN, 1, 0, 1, 0]]) + with self.assertRaises(ValueError): + jaccard_dissim_binary(a, b) + + def test_jaccard_dissim_label(self): + a = np.array([[0, 1, 2, 0, 1, 2]]) + b = np.array([[0, 1, 2, 0, 1, 0]]) + assert_equal(1, jaccard_dissim_label(a, b)) + + a = np.array([[np.NaN, 1, 2, 0, 1, 2]]) + b = 
np.array([[0, 1, 2, 0, 1, 0]]) + with self.assertRaises(ValueError): + jaccard_dissim_label(a, b) + + def test_euclidian_dissim(self): a = np.array([[0., 1., 2., 0., 1., 2.]]) b = np.array([[3., 1., 3., 0., 1., 0.]]) From 3c18f3b115f334b7c8219a95901a8eae35d56ce8 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 14:20:45 +0530 Subject: [PATCH 02/12] Resolving errors in Quality check --- kmodes/tests/test_kmodes.py | 170 +++++++++++++++---------------- kmodes/util/dissim.py | 6 +- kmodes/util/tests/test_dissim.py | 3 +- 3 files changed, 90 insertions(+), 89 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 6f7f3bd..1bc6a5d 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -225,91 +225,91 @@ SOYBEAN3 = SOYBEAN3[:, :35] SOYBEAN4 = np.array([ - [ 2, 22, 14, 45, 2, 0, 1, 2, 5], - [ 2, 13, 13, 19, 2, 0, 1, 2, 5], - [ 3, 25, 4, 3, 0, 1, 2, 0, 4], - [ 2, 13, 15, 18, 0, 1, 2, 2, 3], - [ 3, 10, 4, 42, 0, 2, 1, 1, 2], - [ 2, 16, 21, 14, 0, 1, 2, 2, 2], - [ 2, 16, 19, 37, 0, 2, 1, 2, 2], - [ 2, 20, 9, 34, 0, 1, 2, 3, 5], - [ 2, 14, 21, 44, 0, 1, 2, 3, 2], - [ 2, 26, 5, 30, 0, 1, 2, 3, 3], - [ 3, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 20, 1, 27, 3, 3, 3, 2, 0], - [ 3, 6, 8, 19, 0, 1, 2, 1, 2], - [ 2, 13, 8, 41, 3, 3, 3, 2, 0], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 16, 19, 42, 0, 1, 2, 2, 5], - [ 7, 7, 5, 43, 0, 2, 1, 2, 2], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 3, 3, 5, 12, 3, 3, 3, 2, 0], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 7, 15, 19, 17, 0, 1, 2, 2, 2], - [ 1, 1, 15, 24, 0, 1, 2, 2, 2], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 5, 7, 9, 0, 1, 2, 3, 5], - [ 2, 24, 6, 10, 0, 2, 1, 2, 2], - [ 2, 13, 16, 29, 0, 2, 1, 2, 2], - [ 3, 6, 8, 1, 0, 1, 2, 2, 5], - [ 2, 16, 15, 34, 0, 1, 2, 2, 1], - [ 0, 24, 14, 12, 3, 3, 3, 2, 0], - [ 3, 8, 21, 13, 3, 3, 3, 2, 0], - [ 2, 17, 15, 42, 3, 3, 3, 2, 0], - [ 2, 25, 18, 16, 3, 3, 3, 2, 0], - [ 2, 3, 15, 42, 3, 3, 3, 2, 0], - [ 6, 13, 15, 22, 3, 3, 3, 2, 
0], - [ 3, 8, 18, 24, 1, 0, 2, 2, 5], - [ 7, 20, 15, 26, 1, 0, 2, 2, 1], - [ 2, 20, 7, 35, 0, 1, 2, 2, 5], - [ 2, 16, 12, 28, 0, 1, 2, 2, 5], - [ 2, 16, 5, 39, 0, 1, 2, 2, 2], - [ 3, 6, 11, 8, 0, 1, 2, 2, 2], - [ 7, 6, 15, 44, 1, 0, 2, 2, 4], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 16, 7, 6, 3, 3, 3, 2, 0], - [ 1, 13, 2, 46, 3, 3, 3, 2, 0], - [ 0, 14, 5, 41, 3, 3, 3, 2, 0], - [ 2, 24, 19, 0, 3, 3, 3, 2, 0], - [ 2, 14, 3, 35, 3, 3, 3, 2, 0], - [ 6, 19, 7, 5, 0, 2, 1, 2, 2], - [ 5, 6, 11, 44, 3, 3, 3, 2, 0], - [ 7, 16, 21, 21, 3, 3, 3, 2, 0], - [ 2, 19, 7, 44, 3, 3, 3, 2, 0], - [ 2, 24, 18, 33, 1, 0, 2, 1, 4], - [ 2, 16, 8, 44, 0, 2, 1, 2, 1], - [ 3, 2, 5, 15, 0, 1, 2, 2, 2], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 4, 15, 47, 0, 1, 2, 2, 2], - [ 7, 13, 15, 25, 0, 1, 2, 2, 1], - [ 1, 19, 10, 15, 3, 3, 3, 2, 0], - [ 2, 13, 5, 44, 0, 1, 2, 1, 2], - [ 5, 11, 18, 20, 3, 3, 3, 2, 0], - [ 7, 9, 5, 40, 0, 1, 2, 1, 4], - [ 3, 6, 16, 38, 3, 3, 3, 2, 0], - [ 2, 24, 22, 12, 0, 1, 2, 2, 3], - [ 5, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 16, 15, 23, 0, 1, 2, 2, 5], - [ 2, 13, 0, 25, 1, 0, 2, 2, 2], - [ 2, 23, 15, 36, 3, 3, 3, 2, 0], - [ 2, 25, 10, 2, 1, 0, 2, 2, 5], - [ 2, 21, 7, 4, 1, 0, 2, 2, 1], - [ 1, 18, 17, 41, 3, 3, 3, 2, 0], - [ 2, 18, 17, 41, 3, 3, 3, 2, 0], - [ 6, 9, 1, 0, 3, 3, 3, 2, 0], - [ 1, 7, 20, 47, 3, 3, 3, 2, 0], - [ 2, 25, 10, 7, 0, 1, 2, 2, 2], - [ 7, 0, 4, 32, 1, 2, 0, 2, 5], - [ 1, 12, 12, 15, 0, 1, 2, 3, 3], - [ 2, 26, 15, 25, 0, 1, 2, 0, 5], - [ 2, 20, 15, 19, 0, 1, 2, 2, 1], - [ 4, 6, 9, 11, 2, 0, 1, 1, 4], - [ 2, 13, 15, 42, 0, 2, 1, 2, 2], - [ 3, 5, 21, 31, 0, 1, 2, 3, 5], - [ 2, 13, 19, 33, 0, 2, 1, 2, 2], - [ 1, 11, 10, 0, 0, 2, 1, 0, 2] + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [2, 13, 13, 19, 2, 0, 1, 2, 5], + [3, 25, 4, 3, 0, 1, 2, 0, 4], + [2, 13, 15, 18, 0, 1, 2, 2, 3], + [3, 10, 4, 42, 0, 2, 1, 1, 2], + [2, 16, 21, 14, 0, 1, 2, 2, 2], + [2, 16, 19, 37, 0, 2, 1, 2, 2], + [2, 20, 9, 
34, 0, 1, 2, 3, 5], + [2, 14, 21, 44, 0, 1, 2, 3, 2], + [2, 26, 5, 30, 0, 1, 2, 3, 3], + [3, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 20, 1, 27, 3, 3, 3, 2, 0], + [3, 6, 8, 19, 0, 1, 2, 1, 2], + [2, 13, 8, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 19, 42, 0, 1, 2, 2, 5], + [7, 7, 5, 43, 0, 2, 1, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [3, 3, 5, 12, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [7, 15, 19, 17, 0, 1, 2, 2, 2], + [1, 1, 15, 24, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 5, 7, 9, 0, 1, 2, 3, 5], + [2, 24, 6, 10, 0, 2, 1, 2, 2], + [2, 13, 16, 29, 0, 2, 1, 2, 2], + [3, 6, 8, 1, 0, 1, 2, 2, 5], + [2, 16, 15, 34, 0, 1, 2, 2, 1], + [0, 24, 14, 12, 3, 3, 3, 2, 0], + [3, 8, 21, 13, 3, 3, 3, 2, 0], + [2, 17, 15, 42, 3, 3, 3, 2, 0], + [2, 25, 18, 16, 3, 3, 3, 2, 0], + [2, 3, 15, 42, 3, 3, 3, 2, 0], + [6, 13, 15, 22, 3, 3, 3, 2, 0], + [3, 8, 18, 24, 1, 0, 2, 2, 5], + [7, 20, 15, 26, 1, 0, 2, 2, 1], + [2, 20, 7, 35, 0, 1, 2, 2, 5], + [2, 16, 12, 28, 0, 1, 2, 2, 5], + [2, 16, 5, 39, 0, 1, 2, 2, 2], + [3, 6, 11, 8, 0, 1, 2, 2, 2], + [7, 6, 15, 44, 1, 0, 2, 2, 4], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 7, 6, 3, 3, 3, 2, 0], + [1, 13, 2, 46, 3, 3, 3, 2, 0], + [0, 14, 5, 41, 3, 3, 3, 2, 0], + [2, 24, 19, 0, 3, 3, 3, 2, 0], + [2, 14, 3, 35, 3, 3, 3, 2, 0], + [6, 19, 7, 5, 0, 2, 1, 2, 2], + [5, 6, 11, 44, 3, 3, 3, 2, 0], + [7, 16, 21, 21, 3, 3, 3, 2, 0], + [2, 19, 7, 44, 3, 3, 3, 2, 0], + [2, 24, 18, 33, 1, 0, 2, 1, 4], + [2, 16, 8, 44, 0, 2, 1, 2, 1], + [3, 2, 5, 15, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 4, 15, 47, 0, 1, 2, 2, 2], + [7, 13, 15, 25, 0, 1, 2, 2, 1], + [1, 19, 10, 15, 3, 3, 3, 2, 0], + [2, 13, 5, 44, 0, 1, 2, 1, 2], + [5, 11, 18, 20, 3, 3, 3, 2, 0], + [7, 9, 5, 40, 0, 1, 2, 1, 4], + [3, 6, 16, 38, 3, 3, 3, 2, 0], + [2, 24, 22, 12, 0, 1, 2, 2, 3], + [5, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 15, 23, 0, 1, 2, 2, 5], + [2, 13, 0, 25, 
1, 0, 2, 2, 2], + [2, 23, 15, 36, 3, 3, 3, 2, 0], + [2, 25, 10, 2, 1, 0, 2, 2, 5], + [2, 21, 7, 4, 1, 0, 2, 2, 1], + [1, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [6, 9, 1, 0, 3, 3, 3, 2, 0], + [1, 7, 20, 47, 3, 3, 3, 2, 0], + [2, 25, 10, 7, 0, 1, 2, 2, 2], + [7, 0, 4, 32, 1, 2, 0, 2, 5], + [1, 12, 12, 15, 0, 1, 2, 3, 3], + [2, 26, 15, 25, 0, 1, 2, 0, 5], + [2, 20, 15, 19, 0, 1, 2, 2, 1], + [4, 6, 9, 11, 2, 0, 1, 1, 4], + [2, 13, 15, 42, 0, 2, 1, 2, 2], + [3, 5, 21, 31, 0, 1, 2, 3, 5], + [2, 13, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) SOYBEAN5 = np.array([ diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index ecdfcd5..1b81bc8 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -14,7 +14,7 @@ def jaccard_dissim_binary(a, b, **__): """Jaccard dissimilarity function for biinary encoded variables""" if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2: raise ValueError("Missing or non Binary values detected in Binary columns.") - return np.sum(np.bitwise_and(a, b), axis=1)/np.sum(np.bitwise_or(a, b), axis=1) + return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) def jaccard_dissim_label(a, b, **__): @@ -27,8 +27,8 @@ def jaccard_dissim_label(a, b, **__): for row in a: intersect_len[i] = len(np.intersect1d(row, b)) union_len[i] = len(np.union1d(row, b)) - i = i+1 - return intersect_len/union_len + i = i + 1 + return intersect_len / union_len def euclidean_dissim(a, b, **_): diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index d6b1f35..fd713af 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -7,7 +7,8 @@ import numpy as np from sklearn.utils.testing import assert_equal, assert_array_equal -from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim, jaccard_dissim_binary, jaccard_dissim_label +from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim 
+from kmodes.util.dissim import jaccard_dissim_binary, jaccard_dissim_label class TestDissimilarityMeasures(unittest.TestCase): From c3741dfc81be273cd93a6538d247ad24422b5cc8 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 14:28:11 +0530 Subject: [PATCH 03/12] Resolving errors in Quality check --- kmodes/tests/test_kmodes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 1bc6a5d..2030294 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -326,10 +326,10 @@ SOYBEAN5 = SOYBEAN5[:, :35] SOYBEAN6 = np.array([ - [ 2, 22, 14, 45, 2, 0, 1, 2, 5], - [ 7, 13, 13, 19, 2, 0, 1, 2, 5], - [ 5, 18, 19, 33, 0, 2, 1, 2, 2], - [ 1, 11, 10, 0, 0, 2, 1, 0, 2] + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [7, 13, 13, 19, 2, 0, 1, 2, 5], + [5, 18, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) @@ -554,7 +554,7 @@ def test_kmodes_huang_soybean_jaccard_dissim_binary(self): def test_kmodes_cao_soybean_jaccard_dissim_binary(self): kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, - cat_dissim=jaccard_dissim_binary, random_state=42) + cat_dissim=jaccard_dissim_binary, random_state=42) result = kmodes_Cao.fit_predict(SOYBEAN3) expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, @@ -566,13 +566,12 @@ def test_kmodes_cao_soybean_jaccard_dissim_binary(self): def test_kmodes_predict_soybean_jaccard_dissim_binary(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_binary, random_state=42) - kmodes_huang =kmodes_huang.fit(SOYBEAN3) + kmodes_huang = kmodes_huang.fit(SOYBEAN3) result = kmodes_huang.fit_predict(SOYBEAN5) expected = np.array([1, 0, 1, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) - def test_kmodes_huang_soybean_jaccard_dissim_label(self): 
kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) From f8dbc153f5140f72600a50ececfc6f98f86ff6ba Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 14:35:27 +0530 Subject: [PATCH 04/12] Resolving errors in Quality check --- kmodes/tests/test_kmodes.py | 172 ++++++++++++++++++------------------ 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 2030294..e3af29d 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -225,91 +225,91 @@ SOYBEAN3 = SOYBEAN3[:, :35] SOYBEAN4 = np.array([ - [2, 22, 14, 45, 2, 0, 1, 2, 5], - [2, 13, 13, 19, 2, 0, 1, 2, 5], - [3, 25, 4, 3, 0, 1, 2, 0, 4], - [2, 13, 15, 18, 0, 1, 2, 2, 3], - [3, 10, 4, 42, 0, 2, 1, 1, 2], - [2, 16, 21, 14, 0, 1, 2, 2, 2], - [2, 16, 19, 37, 0, 2, 1, 2, 2], - [2, 20, 9, 34, 0, 1, 2, 3, 5], - [2, 14, 21, 44, 0, 1, 2, 3, 2], - [2, 26, 5, 30, 0, 1, 2, 3, 3], - [3, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 20, 1, 27, 3, 3, 3, 2, 0], - [3, 6, 8, 19, 0, 1, 2, 1, 2], - [2, 13, 8, 41, 3, 3, 3, 2, 0], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 16, 19, 42, 0, 1, 2, 2, 5], - [7, 7, 5, 43, 0, 2, 1, 2, 2], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [3, 3, 5, 12, 3, 3, 3, 2, 0], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [7, 15, 19, 17, 0, 1, 2, 2, 2], - [1, 1, 15, 24, 0, 1, 2, 2, 2], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 5, 7, 9, 0, 1, 2, 3, 5], - [2, 24, 6, 10, 0, 2, 1, 2, 2], - [2, 13, 16, 29, 0, 2, 1, 2, 2], - [3, 6, 8, 1, 0, 1, 2, 2, 5], - [2, 16, 15, 34, 0, 1, 2, 2, 1], - [0, 24, 14, 12, 3, 3, 3, 2, 0], - [3, 8, 21, 13, 3, 3, 3, 2, 0], - [2, 17, 15, 42, 3, 3, 3, 2, 0], - [2, 25, 18, 16, 3, 3, 3, 2, 0], - [2, 3, 15, 42, 3, 3, 3, 2, 0], - [6, 13, 15, 22, 3, 3, 3, 2, 0], - [3, 8, 18, 24, 1, 0, 2, 2, 5], - [7, 20, 15, 26, 1, 0, 2, 2, 1], - [2, 20, 7, 35, 0, 1, 2, 2, 5], - [2, 16, 12, 28, 0, 1, 2, 2, 5], - [2, 16, 5, 39, 0, 1, 2, 2, 2], - [3, 6, 11, 8, 0, 1, 2, 
2, 2], - [7, 6, 15, 44, 1, 0, 2, 2, 4], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 16, 7, 6, 3, 3, 3, 2, 0], - [1, 13, 2, 46, 3, 3, 3, 2, 0], - [0, 14, 5, 41, 3, 3, 3, 2, 0], - [2, 24, 19, 0, 3, 3, 3, 2, 0], - [2, 14, 3, 35, 3, 3, 3, 2, 0], - [6, 19, 7, 5, 0, 2, 1, 2, 2], - [5, 6, 11, 44, 3, 3, 3, 2, 0], - [7, 16, 21, 21, 3, 3, 3, 2, 0], - [2, 19, 7, 44, 3, 3, 3, 2, 0], - [2, 24, 18, 33, 1, 0, 2, 1, 4], - [2, 16, 8, 44, 0, 2, 1, 2, 1], - [3, 2, 5, 15, 0, 1, 2, 2, 2], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 4, 15, 47, 0, 1, 2, 2, 2], - [7, 13, 15, 25, 0, 1, 2, 2, 1], - [1, 19, 10, 15, 3, 3, 3, 2, 0], - [2, 13, 5, 44, 0, 1, 2, 1, 2], - [5, 11, 18, 20, 3, 3, 3, 2, 0], - [7, 9, 5, 40, 0, 1, 2, 1, 4], - [3, 6, 16, 38, 3, 3, 3, 2, 0], - [2, 24, 22, 12, 0, 1, 2, 2, 3], - [5, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 16, 15, 23, 0, 1, 2, 2, 5], - [2, 13, 0, 25, 1, 0, 2, 2, 2], - [2, 23, 15, 36, 3, 3, 3, 2, 0], - [2, 25, 10, 2, 1, 0, 2, 2, 5], - [2, 21, 7, 4, 1, 0, 2, 2, 1], - [1, 18, 17, 41, 3, 3, 3, 2, 0], - [2, 18, 17, 41, 3, 3, 3, 2, 0], - [6, 9, 1, 0, 3, 3, 3, 2, 0], - [1, 7, 20, 47, 3, 3, 3, 2, 0], - [2, 25, 10, 7, 0, 1, 2, 2, 2], - [7, 0, 4, 32, 1, 2, 0, 2, 5], - [1, 12, 12, 15, 0, 1, 2, 3, 3], - [2, 26, 15, 25, 0, 1, 2, 0, 5], - [2, 20, 15, 19, 0, 1, 2, 2, 1], - [4, 6, 9, 11, 2, 0, 1, 1, 4], - [2, 13, 15, 42, 0, 2, 1, 2, 2], - [3, 5, 21, 31, 0, 1, 2, 3, 5], - [2, 13, 19, 33, 0, 2, 1, 2, 2], - [1, 11, 10, 0, 0, 2, 1, 0, 2] + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [2, 13, 13, 19, 2, 0, 1, 2, 5], + [3, 25, 4, 3, 0, 1, 2, 0, 4], + [2, 13, 15, 18, 0, 1, 2, 2, 3], + [3, 10, 4, 42, 0, 2, 1, 1, 2], + [2, 16, 21, 14, 0, 1, 2, 2, 2], + [2, 16, 19, 37, 0, 2, 1, 2, 2], + [2, 20, 9, 34, 0, 1, 2, 3, 5], + [2, 14, 21, 44, 0, 1, 2, 3, 2], + [2, 26, 5, 30, 0, 1, 2, 3, 3], + [3, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 20, 1, 27, 3, 3, 3, 2, 0], + [3, 6, 8, 19, 0, 1, 2, 1, 2], + [2, 13, 8, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 
0], + [2, 16, 19, 42, 0, 1, 2, 2, 5], + [7, 7, 5, 43, 0, 2, 1, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [3, 3, 5, 12, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [7, 15, 19, 17, 0, 1, 2, 2, 2], + [1, 1, 15, 24, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 5, 7, 9, 0, 1, 2, 3, 5], + [2, 24, 6, 10, 0, 2, 1, 2, 2], + [2, 13, 16, 29, 0, 2, 1, 2, 2], + [3, 6, 8, 1, 0, 1, 2, 2, 5], + [2, 16, 15, 34, 0, 1, 2, 2, 1], + [0, 24, 14, 12, 3, 3, 3, 2, 0], + [3, 8, 21, 13, 3, 3, 3, 2, 0], + [2, 17, 15, 42, 3, 3, 3, 2, 0], + [2, 25, 18, 16, 3, 3, 3, 2, 0], + [2, 3, 15, 42, 3, 3, 3, 2, 0], + [6, 13, 15, 22, 3, 3, 3, 2, 0], + [3, 8, 18, 24, 1, 0, 2, 2, 5], + [7, 20, 15, 26, 1, 0, 2, 2, 1], + [2, 20, 7, 35, 0, 1, 2, 2, 5], + [2, 16, 12, 28, 0, 1, 2, 2, 5], + [2, 16, 5, 39, 0, 1, 2, 2, 2], + [3, 6, 11, 8, 0, 1, 2, 2, 2], + [7, 6, 15, 44, 1, 0, 2, 2, 4], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 7, 6, 3, 3, 3, 2, 0], + [1, 13, 2, 46, 3, 3, 3, 2, 0], + [0, 14, 5, 41, 3, 3, 3, 2, 0], + [2, 24, 19, 0, 3, 3, 3, 2, 0], + [2, 14, 3, 35, 3, 3, 3, 2, 0], + [6, 19, 7, 5, 0, 2, 1, 2, 2], + [5, 6, 11, 44, 3, 3, 3, 2, 0], + [7, 16, 21, 21, 3, 3, 3, 2, 0], + [2, 19, 7, 44, 3, 3, 3, 2, 0], + [2, 24, 18, 33, 1, 0, 2, 1, 4], + [2, 16, 8, 44, 0, 2, 1, 2, 1], + [3, 2, 5, 15, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 4, 15, 47, 0, 1, 2, 2, 2], + [7, 13, 15, 25, 0, 1, 2, 2, 1], + [1, 19, 10, 15, 3, 3, 3, 2, 0], + [2, 13, 5, 44, 0, 1, 2, 1, 2], + [5, 11, 18, 20, 3, 3, 3, 2, 0], + [7, 9, 5, 40, 0, 1, 2, 1, 4], + [3, 6, 16, 38, 3, 3, 3, 2, 0], + [2, 24, 22, 12, 0, 1, 2, 2, 3], + [5, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 15, 23, 0, 1, 2, 2, 5], + [2, 13, 0, 25, 1, 0, 2, 2, 2], + [2, 23, 15, 36, 3, 3, 3, 2, 0], + [2, 25, 10, 2, 1, 0, 2, 2, 5], + [2, 21, 7, 4, 1, 0, 2, 2, 1], + [1, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [6, 9, 1, 0, 3, 3, 3, 2, 0], + [1, 7, 20, 47, 3, 3, 3, 2, 0], + 
[2, 25, 10, 7, 0, 1, 2, 2, 2], + [7, 0, 4, 32, 1, 2, 0, 2, 5], + [1, 12, 12, 15, 0, 1, 2, 3, 3], + [2, 26, 15, 25, 0, 1, 2, 0, 5], + [2, 20, 15, 19, 0, 1, 2, 2, 1], + [4, 6, 9, 11, 2, 0, 1, 1, 4], + [2, 13, 15, 42, 0, 2, 1, 2, 2], + [3, 5, 21, 31, 0, 1, 2, 3, 5], + [2, 13, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) SOYBEAN5 = np.array([ @@ -600,7 +600,7 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self): def test_kmodes_predict_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) - kmodes_huang =kmodes_huang.fit(SOYBEAN4) + kmodes_huang = kmodes_huang.fit(SOYBEAN4) result = kmodes_huang.fit_predict(SOYBEAN6) expected = np.array([0, 1, 0, 3]) assert_cluster_splits_equal(result, expected) From 0bcc371847ab14c3e99ddf91d1b7b042111fde85 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 14:38:46 +0530 Subject: [PATCH 05/12] Resolving errors in Quality check --- kmodes/tests/test_kmodes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index e3af29d..1420fd5 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -326,10 +326,10 @@ SOYBEAN5 = SOYBEAN5[:, :35] SOYBEAN6 = np.array([ - [2, 22, 14, 45, 2, 0, 1, 2, 5], - [7, 13, 13, 19, 2, 0, 1, 2, 5], - [5, 18, 19, 33, 0, 2, 1, 2, 2], - [1, 11, 10, 0, 0, 2, 1, 0, 2] + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [7, 13, 13, 19, 2, 0, 1, 2, 5], + [5, 18, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) From 44fcc68888ceab974288573ab6d115c2e5165947 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Sun, 21 Jul 2019 16:32:12 +0530 Subject: [PATCH 06/12] Resolving Build errors for python ver 3.6.0 --- kmodes/tests/test_kmodes.py | 22 +++++++++++----------- kmodes/util/dissim.py | 3 ++- kmodes/util/tests/test_dissim.py | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff 
--git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 1420fd5..a09bf6c 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -576,11 +576,11 @@ def test_kmodes_huang_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) result = kmodes_huang.fit_predict(SOYBEAN4) - expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3, - 0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1, - 1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1, - 0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3, - 3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) + expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, + 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, + 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -588,11 +588,11 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) result = kmodes_huang.fit_predict(SOYBEAN4) - expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, - 1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1, - 0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]) + expected = np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 1, 0, + 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, + 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ 
-602,7 +602,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_label(self): cat_dissim=jaccard_dissim_label, random_state=42) kmodes_huang = kmodes_huang.fit(SOYBEAN4) result = kmodes_huang.fit_predict(SOYBEAN6) - expected = np.array([0, 1, 0, 3]) + expected = np.array([0, 0, 0, 0]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index 1b81bc8..f2d3df2 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -26,7 +26,8 @@ def jaccard_dissim_label(a, b, **__): i = 0 for row in a: intersect_len[i] = len(np.intersect1d(row, b)) - union_len[i] = len(np.union1d(row, b)) + union_len[i] = len(row) + len(b) - intersect_len[i] + # union_len[i] = np.unique(np.concatenate((row, b))) i = i + 1 return intersect_len / union_len diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index fd713af..2db5edf 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -39,7 +39,7 @@ def test_jaccard_dissim_binary(self): def test_jaccard_dissim_label(self): a = np.array([[0, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 1, 0]]) - assert_equal(1, jaccard_dissim_label(a, b)) + assert_equal(0.75, jaccard_dissim_label(a, b)) a = np.array([[np.NaN, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 1, 0]]) From 8e97e631da24177368b9f6ae7cd4b5d4f2baa304 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Tue, 23 Jul 2019 11:51:22 +0530 Subject: [PATCH 07/12] Added some recommended changes --- kmodes/tests/test_kmodes.py | 141 ++++--------------------------- kmodes/util/dissim.py | 13 ++- kmodes/util/tests/test_dissim.py | 6 ++ 3 files changed, 30 insertions(+), 130 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index a09bf6c..2e43f9a 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -124,106 +124,6 @@ # Drop target column SOYBEAN2 = SOYBEAN2[:, :35] -# SOYBEAN 
Binary encoded -SOYBEAN3 = np.array([ - [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 
1, 0, 0, 0, 0, 0, 0, 'D2'], - [0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D3'], - [1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 
0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], - [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 1, 'D4'], -]) -# Drop target column -SOYBEAN3 = SOYBEAN3[:, :35] - SOYBEAN4 = np.array([ [2, 22, 14, 45, 2, 0, 1, 2, 5], [2, 13, 13, 19, 2, 0, 1, 2, 5], @@ -312,19 +212,6 @@ [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) -SOYBEAN5 = np.array([ - [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D1'], - [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 0, 0, 'D2'], - [0, 
1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 'D3'], - [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 1, 'D4'], -]) -# Drop target column -SOYBEAN5 = SOYBEAN5[:, :35] - SOYBEAN6 = np.array([ [2, 22, 14, 45, 2, 0, 1, 2, 5], [7, 13, 13, 19, 2, 0, 1, 2, 5], @@ -545,19 +432,23 @@ def test_kmodes_nunique_nclusters_ng(self): def test_kmodes_huang_soybean_jaccard_dissim_binary(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_binary, random_state=42) - result = kmodes_huang.fit_predict(SOYBEAN3) - expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, - 3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + # binary encoded variables are required + bin_variables = SOYBEAN.astype(bool).astype(int) + result = kmodes_huang.fit_predict(bin_variables) + expected = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, + 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) def test_kmodes_cao_soybean_jaccard_dissim_binary(self): kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, cat_dissim=jaccard_dissim_binary, random_state=42) - result = kmodes_Cao.fit_predict(SOYBEAN3) - expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2, - 2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, + # binary encoded variables are required + bin_variables = SOYBEAN.astype(bool).astype(int) + result = kmodes_Cao.fit_predict(bin_variables) + expected = np.array([3, 2, 2, 3, 3, 2, 3, 2, 2, 3, 1, 2, 1, 2, 1, 2, 1, + 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) assert_cluster_splits_equal(result, expected) @@ -566,9 +457,13 @@ def 
test_kmodes_cao_soybean_jaccard_dissim_binary(self): def test_kmodes_predict_soybean_jaccard_dissim_binary(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_binary, random_state=42) - kmodes_huang = kmodes_huang.fit(SOYBEAN3) - result = kmodes_huang.fit_predict(SOYBEAN5) - expected = np.array([1, 0, 1, 1]) + # binary encoded variables are required + bin_variables = SOYBEAN.astype(bool).astype(int) + kmodes_huang = kmodes_huang.fit(bin_variables) + # binary encoded variables required for prediction as well + bin_variables_pred = SOYBEAN2.astype(bool).astype(int) + result = kmodes_huang.fit_predict(bin_variables_pred) + expected = np.array([1, 2, 1, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index f2d3df2..4c28504 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -11,15 +11,15 @@ def matching_dissim(a, b, **_): def jaccard_dissim_binary(a, b, **__): - """Jaccard dissimilarity function for biinary encoded variables""" - if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2: - raise ValueError("Missing or non Binary values detected in Binary columns.") - return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) + """Jaccard dissimilarity function for binary encoded variables""" + if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all(): + return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) + raise ValueError("Missing or non Binary values detected in Binary columns.") def jaccard_dissim_label(a, b, **__): """Jaccard dissimilarity function for label encoded variables""" - if (a.astype(int) < 0).any() or (b.astype(int) < 0).any(): + if np.isnan(np.array(a, dtype=np.float64)).any() or np.isnan(np.array(b, dtype=np.float64)).any(): raise ValueError("Missing values detected in Numeric columns.") intersect_len = 
np.empty(len(a), dtype=int) union_len = np.empty(len(a), dtype=int) @@ -27,8 +27,7 @@ def jaccard_dissim_label(a, b, **__): for row in a: intersect_len[i] = len(np.intersect1d(row, b)) union_len[i] = len(row) + len(b) - intersect_len[i] - # union_len[i] = np.unique(np.concatenate((row, b))) - i = i + 1 + i += 1 return intersect_len / union_len diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index 2db5edf..826de7e 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -36,6 +36,12 @@ def test_jaccard_dissim_binary(self): with self.assertRaises(ValueError): jaccard_dissim_binary(a, b) + # test where values are non binary but also not having np.NaN + a = np.array([[0, 1, 2, 0, 1, 2]]) + b = np.array([[0, 1, 2, 0, 1, 0]]) + with self.assertRaises(ValueError): + jaccard_dissim_binary(a, b) + def test_jaccard_dissim_label(self): a = np.array([[0, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 1, 0]]) From 45078dd070499319e3bc5ac1d6a5e663360bfa67 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Tue, 23 Jul 2019 12:03:56 +0530 Subject: [PATCH 08/12] Resolving errors in Quality Check --- kmodes/util/dissim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index 4c28504..e5a5cf9 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -19,7 +19,7 @@ def jaccard_dissim_binary(a, b, **__): def jaccard_dissim_label(a, b, **__): """Jaccard dissimilarity function for label encoded variables""" - if np.isnan(np.array(a, dtype=np.float64)).any() or np.isnan(np.array(b, dtype=np.float64)).any(): + if np.isnan(a.astype('float64')).any() or np.isnan(b.astype('float64')).any(): raise ValueError("Missing values detected in Numeric columns.") intersect_len = np.empty(len(a), dtype=int) union_len = np.empty(len(a), dtype=int) From 71ade685f6ee2452ca757c7f09d79e15a41a3b48 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Fri, 26 Jul 2019 18:48:08 
+0530 Subject: [PATCH 09/12] added suggested changes and corrected logic error --- kmodes/tests/test_kmodes.py | 35 ++++++++++++++++---------------- kmodes/util/dissim.py | 4 +++- kmodes/util/tests/test_dissim.py | 23 ++++++++++++++++++++- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 2e43f9a..e93a9a9 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -124,7 +124,8 @@ # Drop target column SOYBEAN2 = SOYBEAN2[:, :35] -SOYBEAN4 = np.array([ +# test data with categorical variables that have been label encoded +TEST_DATA = np.array([ [2, 22, 14, 45, 2, 0, 1, 2, 5], [2, 13, 13, 19, 2, 0, 1, 2, 5], [3, 25, 4, 3, 0, 1, 2, 0, 4], @@ -212,7 +213,7 @@ [1, 11, 10, 0, 0, 2, 1, 0, 2] ]) -SOYBEAN6 = np.array([ +TEST_DATA_PREDICT = np.array([ [2, 22, 14, 45, 2, 0, 1, 2, 5], [7, 13, 13, 19, 2, 0, 1, 2, 5], [5, 18, 19, 33, 0, 2, 1, 2, 2], @@ -470,24 +471,24 @@ def test_kmodes_predict_soybean_jaccard_dissim_binary(self): def test_kmodes_huang_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) - result = kmodes_huang.fit_predict(SOYBEAN4) - expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, - 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, - 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]) + result = kmodes_huang.fit_predict(TEST_DATA) + expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3, + 0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1, + 1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1, + 0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3, + 3, 0,0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) def 
test_kmodes_cao_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) - result = kmodes_huang.fit_predict(SOYBEAN4) - expected = np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 1, 0, - 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, - 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]) + result = kmodes_huang.fit_predict(TEST_DATA) + expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, + 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, + 1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -495,9 +496,9 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self): def test_kmodes_predict_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) - kmodes_huang = kmodes_huang.fit(SOYBEAN4) - result = kmodes_huang.fit_predict(SOYBEAN6) - expected = np.array([0, 0, 0, 0]) + kmodes_huang = kmodes_huang.fit(TEST_DATA) + result = kmodes_huang.fit_predict(TEST_DATA_PREDICT) + expected = np.array([0, 1, 0, 3]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index e5a5cf9..a2092c0 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -26,8 +26,10 @@ def jaccard_dissim_label(a, b, **__): i = 0 for row in a: intersect_len[i] = len(np.intersect1d(row, b)) - union_len[i] = len(row) + len(b) - intersect_len[i] + union_len[i] = len(np.unique(row)) + len(np.unique(b)) - intersect_len[i] i += 1 + if (union_len == 0).any(): + raise 
ValueError("Insufficient Number of data since union is 0") return intersect_len / union_len diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index 826de7e..1924541 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -42,9 +42,20 @@ def test_jaccard_dissim_binary(self): with self.assertRaises(ValueError): jaccard_dissim_binary(a, b) + # test for dissimilarity = 1 + a = np.array([[1, 1, 0, 1, 1, 0]]) + b = np.array([[1, 1, 0, 1, 1, 0]]) + assert_equal(1, jaccard_dissim_binary(a, b)) + + # test for dissimilarity = 0 + a = np.array([[0, 0, 1, 0, 0, 1]]) + b = np.array([[1, 1, 0, 1, 1, 0]]) + assert_equal(0, jaccard_dissim_binary(a, b)) + + def test_jaccard_dissim_label(self): a = np.array([[0, 1, 2, 0, 1, 2]]) - b = np.array([[0, 1, 2, 0, 1, 0]]) + b = np.array([[0, 1, 2, 0, 3, 0]]) assert_equal(0.75, jaccard_dissim_label(a, b)) a = np.array([[np.NaN, 1, 2, 0, 1, 2]]) @@ -52,6 +63,16 @@ def test_jaccard_dissim_label(self): with self.assertRaises(ValueError): jaccard_dissim_label(a, b) + # test for dissimilarity = 1 + a = np.array([[1, 2, 0, 3, 1, 0]]) + b = np.array([[1, 2, 0, 3, 1, 0]]) + assert_equal(1, jaccard_dissim_label(a, b)) + + # test for dissimilarity = 0 + a = np.array([[1, 2, 0, 3, 1, 0]]) + b = np.array([[5, 4, 6, 7, 8, 9]]) + assert_equal(0, jaccard_dissim_label(a, b)) + def test_euclidian_dissim(self): a = np.array([[0., 1., 2., 0., 1., 2.]]) From f4598d31e2c38f16452714065484623043464226 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Fri, 26 Jul 2019 22:23:04 +0530 Subject: [PATCH 10/12] resolving Quality check errors --- kmodes/tests/test_kmodes.py | 2 +- kmodes/util/tests/test_dissim.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index e93a9a9..3ec2137 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -476,7 +476,7 @@ def 
test_kmodes_huang_soybean_jaccard_dissim_label(self): 0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1, 1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1, 0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3, - 3, 0,0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) + 3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index 1924541..0f2f267 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -52,7 +52,6 @@ def test_jaccard_dissim_binary(self): b = np.array([[1, 1, 0, 1, 1, 0]]) assert_equal(0, jaccard_dissim_binary(a, b)) - def test_jaccard_dissim_label(self): a = np.array([[0, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 3, 0]]) From 9fe5b6dd274afb6a57b9961523144a578707da30 Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Fri, 26 Jul 2019 23:07:57 +0530 Subject: [PATCH 11/12] Final jaccard dissimilarity/distance logic established --- kmodes/tests/test_kmodes.py | 35 ++++++++++++++++---------------- kmodes/util/dissim.py | 4 ++-- kmodes/util/tests/test_dissim.py | 20 +++++++++--------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 3ec2137..4d3db10 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -436,9 +436,9 @@ def test_kmodes_huang_soybean_jaccard_dissim_binary(self): # binary encoded variables are required bin_variables = SOYBEAN.astype(bool).astype(int) result = kmodes_huang.fit_predict(bin_variables) - expected = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, - 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, + 3, 3, 1, 1, 3, 1, 3, 1, 1]) 
assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -448,9 +448,9 @@ def test_kmodes_cao_soybean_jaccard_dissim_binary(self): # binary encoded variables are required bin_variables = SOYBEAN.astype(bool).astype(int) result = kmodes_Cao.fit_predict(bin_variables) - expected = np.array([3, 2, 2, 3, 3, 2, 3, 2, 2, 3, 1, 2, 1, 2, 1, 2, 1, - 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -464,7 +464,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_binary(self): # binary encoded variables required for prediction as well bin_variables_pred = SOYBEAN2.astype(bool).astype(int) result = kmodes_huang.fit_predict(bin_variables_pred) - expected = np.array([1, 2, 1, 1]) + expected = np.array([0, 1, 2, 3]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -472,11 +472,11 @@ def test_kmodes_huang_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) result = kmodes_huang.fit_predict(TEST_DATA) - expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3, - 0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1, - 1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1, - 0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3, - 3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1]) + expected = np.array([3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 0, 0, + 0, 2, 2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 2, 3, 3, + 3, 2, 2, 0, 0, 2, 1, 0, 0, 0, 2, 3, 0, 0, 2, 3, 2, 0, 2, + 2, 2, 3, 0, 3, 2, 2, 0, 0, 3, 2, 1, 3, 2, 0, 0, 2, 2, 2, + 3, 2, 2, 2, 2, 1, 3, 2, 2]) 
assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -484,11 +484,10 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) result = kmodes_huang.fit_predict(TEST_DATA) - expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, - 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, - 1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1, - 0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]) + expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1, + 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2, + 0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0, + 1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) @@ -498,7 +497,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_label(self): cat_dissim=jaccard_dissim_label, random_state=42) kmodes_huang = kmodes_huang.fit(TEST_DATA) result = kmodes_huang.fit_predict(TEST_DATA_PREDICT) - expected = np.array([0, 1, 0, 3]) + expected = np.array([1, 0, 1, 2]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index a2092c0..a764b5a 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -13,7 +13,7 @@ def matching_dissim(a, b, **_): def jaccard_dissim_binary(a, b, **__): """Jaccard dissimilarity function for binary encoded variables""" if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all(): - return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) + return 1 - np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) raise ValueError("Missing or non Binary values 
detected in Binary columns.") @@ -30,7 +30,7 @@ def jaccard_dissim_label(a, b, **__): i += 1 if (union_len == 0).any(): raise ValueError("Insufficient Number of data since union is 0") - return intersect_len / union_len + return 1 - intersect_len / union_len def euclidean_dissim(a, b, **_): diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py index 0f2f267..24a2be0 100644 --- a/kmodes/util/tests/test_dissim.py +++ b/kmodes/util/tests/test_dissim.py @@ -29,7 +29,7 @@ def test_matching_dissim(self): def test_jaccard_dissim_binary(self): a = np.array([[0, 1, 1, 0, 1, 1]]) b = np.array([[0, 1, 1, 0, 1, 0]]) - assert_equal(0.75, jaccard_dissim_binary(a, b)) + assert_equal(0.25, jaccard_dissim_binary(a, b)) a = np.array([[0, 1, 1, 0, 1, 1]]) b = np.array([[0, np.NaN, 1, 0, 1, 0]]) @@ -42,35 +42,35 @@ def test_jaccard_dissim_binary(self): with self.assertRaises(ValueError): jaccard_dissim_binary(a, b) - # test for dissimilarity = 1 + # test for dissimilarity = 0 both sets are same a = np.array([[1, 1, 0, 1, 1, 0]]) b = np.array([[1, 1, 0, 1, 1, 0]]) - assert_equal(1, jaccard_dissim_binary(a, b)) + assert_equal(0, jaccard_dissim_binary(a, b)) - # test for dissimilarity = 0 + # test for dissimilarity = 1 sets are different a = np.array([[0, 0, 1, 0, 0, 1]]) b = np.array([[1, 1, 0, 1, 1, 0]]) - assert_equal(0, jaccard_dissim_binary(a, b)) + assert_equal(1, jaccard_dissim_binary(a, b)) def test_jaccard_dissim_label(self): a = np.array([[0, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 3, 0]]) - assert_equal(0.75, jaccard_dissim_label(a, b)) + assert_equal(0.25, jaccard_dissim_label(a, b)) a = np.array([[np.NaN, 1, 2, 0, 1, 2]]) b = np.array([[0, 1, 2, 0, 1, 0]]) with self.assertRaises(ValueError): jaccard_dissim_label(a, b) - # test for dissimilarity = 1 + # test for dissimilarity = 0 Both sets are same a = np.array([[1, 2, 0, 3, 1, 0]]) b = np.array([[1, 2, 0, 3, 1, 0]]) - assert_equal(1, jaccard_dissim_label(a, b)) + assert_equal(0, 
jaccard_dissim_label(a, b)) - # test for dissimilarity = 0 + # test for dissimilarity = 1 Both sets are different a = np.array([[1, 2, 0, 3, 1, 0]]) b = np.array([[5, 4, 6, 7, 8, 9]]) - assert_equal(0, jaccard_dissim_label(a, b)) + assert_equal(1, jaccard_dissim_label(a, b)) def test_euclidian_dissim(self): From 4f2efa060783bc13942cf289e9f143202bf5c74c Mon Sep 17 00:00:00 2001 From: BikashPandey17 Date: Fri, 26 Jul 2019 23:19:16 +0530 Subject: [PATCH 12/12] resolving Quality check errors and added denominator 0 check --- kmodes/tests/test_kmodes.py | 9 +++++---- kmodes/util/dissim.py | 7 ++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 4d3db10..6aecc0a 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -484,10 +484,11 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self): kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, cat_dissim=jaccard_dissim_label, random_state=42) result = kmodes_huang.fit_predict(TEST_DATA) - expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1, - 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2, - 0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0, - 1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1]) + expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, + 0, 0, 1, 1, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, + 3, 1, 1, 2, 2, 0, 0, 2, 0, 0, 0, 0, 3, 2, 2, 2, 0, 1, + 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, + 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1]) assert_cluster_splits_equal(result, expected) self.assertTrue(result.dtype == np.dtype(np.uint16)) diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py index a764b5a..08a15f6 100644 --- a/kmodes/util/dissim.py +++ b/kmodes/util/dissim.py @@ -13,7 +13,12 @@ def matching_dissim(a, b, **_): def jaccard_dissim_binary(a, b, **__): """Jaccard dissimilarity 
function for binary encoded variables""" if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all(): - return 1 - np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1) + numerator = np.sum(np.bitwise_and(a, b), axis=1) + denominator = np.sum(np.bitwise_or(a, b), axis=1) + if (denominator == 0).any(0): + raise ValueError("Insufficient Number of data since union is 0") + else: + return 1 - numerator / denominator raise ValueError("Missing or non Binary values detected in Binary columns.")