From 590a51f6dd704a85971df1327e54b7b9f99cf6f4 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 13:36:20 +0530
Subject: [PATCH 01/12] Add Jaccard dissimilarity functions for label-encoded
 and binary-encoded variables, along with related unit tests

---
 kmodes/tests/test_kmodes.py      | 276 ++++++++++++++++++++++++++++++-
 kmodes/util/dissim.py            |  21 +++
 kmodes/util/tests/test_dissim.py |  23 ++-
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 2811159..6f7f3bd 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -9,7 +9,7 @@
 from sklearn.utils.testing import assert_equal
 
 from kmodes.kmodes import KModes
-from kmodes.util.dissim import ng_dissim
+from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label
 
 
 SOYBEAN = np.array([
@@ -124,6 +124,214 @@
 # Drop target column
 SOYBEAN2 = SOYBEAN2[:, :35]
 
+# SOYBEAN Binary encoded
+SOYBEAN3 = np.array([
+    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D3'],
+    [1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+    [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+])
+# Drop target column
+SOYBEAN3 = SOYBEAN3[:, :35]
+
+SOYBEAN4 = np.array([
+       [ 2, 22, 14, 45,  2,  0,  1,  2,  5],
+       [ 2, 13, 13, 19,  2,  0,  1,  2,  5],
+       [ 3, 25,  4,  3,  0,  1,  2,  0,  4],
+       [ 2, 13, 15, 18,  0,  1,  2,  2,  3],
+       [ 3, 10,  4, 42,  0,  2,  1,  1,  2],
+       [ 2, 16, 21, 14,  0,  1,  2,  2,  2],
+       [ 2, 16, 19, 37,  0,  2,  1,  2,  2],
+       [ 2, 20,  9, 34,  0,  1,  2,  3,  5],
+       [ 2, 14, 21, 44,  0,  1,  2,  3,  2],
+       [ 2, 26,  5, 30,  0,  1,  2,  3,  3],
+       [ 3, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 20,  1, 27,  3,  3,  3,  2,  0],
+       [ 3,  6,  8, 19,  0,  1,  2,  1,  2],
+       [ 2, 13,  8, 41,  3,  3,  3,  2,  0],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 16, 19, 42,  0,  1,  2,  2,  5],
+       [ 7,  7,  5, 43,  0,  2,  1,  2,  2],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 3,  3,  5, 12,  3,  3,  3,  2,  0],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 7, 15, 19, 17,  0,  1,  2,  2,  2],
+       [ 1,  1, 15, 24,  0,  1,  2,  2,  2],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2,  5,  7,  9,  0,  1,  2,  3,  5],
+       [ 2, 24,  6, 10,  0,  2,  1,  2,  2],
+       [ 2, 13, 16, 29,  0,  2,  1,  2,  2],
+       [ 3,  6,  8,  1,  0,  1,  2,  2,  5],
+       [ 2, 16, 15, 34,  0,  1,  2,  2,  1],
+       [ 0, 24, 14, 12,  3,  3,  3,  2,  0],
+       [ 3,  8, 21, 13,  3,  3,  3,  2,  0],
+       [ 2, 17, 15, 42,  3,  3,  3,  2,  0],
+       [ 2, 25, 18, 16,  3,  3,  3,  2,  0],
+       [ 2,  3, 15, 42,  3,  3,  3,  2,  0],
+       [ 6, 13, 15, 22,  3,  3,  3,  2,  0],
+       [ 3,  8, 18, 24,  1,  0,  2,  2,  5],
+       [ 7, 20, 15, 26,  1,  0,  2,  2,  1],
+       [ 2, 20,  7, 35,  0,  1,  2,  2,  5],
+       [ 2, 16, 12, 28,  0,  1,  2,  2,  5],
+       [ 2, 16,  5, 39,  0,  1,  2,  2,  2],
+       [ 3,  6, 11,  8,  0,  1,  2,  2,  2],
+       [ 7,  6, 15, 44,  1,  0,  2,  2,  4],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 16,  7,  6,  3,  3,  3,  2,  0],
+       [ 1, 13,  2, 46,  3,  3,  3,  2,  0],
+       [ 0, 14,  5, 41,  3,  3,  3,  2,  0],
+       [ 2, 24, 19,  0,  3,  3,  3,  2,  0],
+       [ 2, 14,  3, 35,  3,  3,  3,  2,  0],
+       [ 6, 19,  7,  5,  0,  2,  1,  2,  2],
+       [ 5,  6, 11, 44,  3,  3,  3,  2,  0],
+       [ 7, 16, 21, 21,  3,  3,  3,  2,  0],
+       [ 2, 19,  7, 44,  3,  3,  3,  2,  0],
+       [ 2, 24, 18, 33,  1,  0,  2,  1,  4],
+       [ 2, 16,  8, 44,  0,  2,  1,  2,  1],
+       [ 3,  2,  5, 15,  0,  1,  2,  2,  2],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2,  4, 15, 47,  0,  1,  2,  2,  2],
+       [ 7, 13, 15, 25,  0,  1,  2,  2,  1],
+       [ 1, 19, 10, 15,  3,  3,  3,  2,  0],
+       [ 2, 13,  5, 44,  0,  1,  2,  1,  2],
+       [ 5, 11, 18, 20,  3,  3,  3,  2,  0],
+       [ 7,  9,  5, 40,  0,  1,  2,  1,  4],
+       [ 3,  6, 16, 38,  3,  3,  3,  2,  0],
+       [ 2, 24, 22, 12,  0,  1,  2,  2,  3],
+       [ 5, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 16, 15, 23,  0,  1,  2,  2,  5],
+       [ 2, 13,  0, 25,  1,  0,  2,  2,  2],
+       [ 2, 23, 15, 36,  3,  3,  3,  2,  0],
+       [ 2, 25, 10,  2,  1,  0,  2,  2,  5],
+       [ 2, 21,  7,  4,  1,  0,  2,  2,  1],
+       [ 1, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [ 6,  9,  1,  0,  3,  3,  3,  2,  0],
+       [ 1,  7, 20, 47,  3,  3,  3,  2,  0],
+       [ 2, 25, 10,  7,  0,  1,  2,  2,  2],
+       [ 7,  0,  4, 32,  1,  2,  0,  2,  5],
+       [ 1, 12, 12, 15,  0,  1,  2,  3,  3],
+       [ 2, 26, 15, 25,  0,  1,  2,  0,  5],
+       [ 2, 20, 15, 19,  0,  1,  2,  2,  1],
+       [ 4,  6,  9, 11,  2,  0,  1,  1,  4],
+       [ 2, 13, 15, 42,  0,  2,  1,  2,  2],
+       [ 3,  5, 21, 31,  0,  1,  2,  3,  5],
+       [ 2, 13, 19, 33,  0,  2,  1,  2,  2],
+       [ 1, 11, 10,  0,  0,  2,  1,  0,  2]
+])
+
+SOYBEAN5 = np.array([
+    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D1'],
+    [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D2'],
+    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 0, 'D3'],
+    [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
+     1, 0, 0, 0, 0, 0, 1, 'D4'],
+])
+# Drop target column
+SOYBEAN5 = SOYBEAN5[:, :35]
+
+SOYBEAN6 = np.array([
+       [ 2, 22, 14, 45,  2,  0,  1,  2,  5],
+       [ 7, 13, 13, 19,  2,  0,  1,  2,  5],
+       [ 5, 18, 19, 33,  0,  2,  1,  2,  2],
+       [ 1, 11, 10,  0,  0,  2,  1,  0,  2]
+])
+
 
 def assert_cluster_splits_equal(array1, array2):
 
@@ -334,6 +542,72 @@ def test_kmodes_nunique_nclusters_ng(self):
                                       np.array([[0, 2],
                                                 [0, 1]]))
 
+    def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
+                              cat_dissim=jaccard_dissim_binary, random_state=42)
+        result = kmodes_huang.fit_predict(SOYBEAN3)
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3,
+                             3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
+                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+    def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
+        kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
+                              cat_dissim=jaccard_dissim_binary, random_state=42)
+        result = kmodes_Cao.fit_predict(SOYBEAN3)
+        expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2,
+                             2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+    def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
+                              cat_dissim=jaccard_dissim_binary, random_state=42)
+        kmodes_huang =kmodes_huang.fit(SOYBEAN3)
+        result = kmodes_huang.fit_predict(SOYBEAN5)
+        expected = np.array([1, 0, 1, 1])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+
+    def test_kmodes_huang_soybean_jaccard_dissim_label(self):
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
+                              cat_dissim=jaccard_dissim_label, random_state=42)
+        result = kmodes_huang.fit_predict(SOYBEAN4)
+        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3,
+                             0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1,
+                             1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1,
+                             0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3,
+                             3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+    def test_kmodes_cao_soybean_jaccard_dissim_label(self):
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
+                              cat_dissim=jaccard_dissim_label, random_state=42)
+        result = kmodes_huang.fit_predict(SOYBEAN4)
+        expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
+                             1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0,
+                             1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1,
+                             0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])
+
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+    def test_kmodes_predict_soybean_jaccard_dissim_label(self):
+        kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
+                              cat_dissim=jaccard_dissim_label, random_state=42)
+        kmodes_huang =kmodes_huang.fit(SOYBEAN4)
+        result = kmodes_huang.fit_predict(SOYBEAN6)
+        expected = np.array([0, 1, 0, 3])
+        assert_cluster_splits_equal(result, expected)
+        self.assertTrue(result.dtype == np.dtype(np.uint16))
+
+
     def test_kmodes_ninit(self):
         kmodes = KModes(n_init=10, init='Huang')
         self.assertEqual(kmodes.n_init, 10)
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index 4f27471..ecdfcd5 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -10,6 +10,27 @@ def matching_dissim(a, b, **_):
     return np.sum(a != b, axis=1)
 
 
+def jaccard_dissim_binary(a, b, **__):
+    """Jaccard dissimilarity function for biinary encoded variables"""
+    if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2:
+        raise ValueError("Missing or non Binary values detected in Binary columns.")
+    return np.sum(np.bitwise_and(a, b), axis=1)/np.sum(np.bitwise_or(a, b), axis=1)
+
+
+def jaccard_dissim_label(a, b, **__):
+    """Jaccard dissimilarity function for label encoded variables"""
+    if (a.astype(int) < 0).any() or (b.astype(int) < 0).any():
+        raise ValueError("Missing values detected in Numeric columns.")
+    intersect_len = np.empty(len(a), dtype=int)
+    union_len = np.empty(len(a), dtype=int)
+    i = 0
+    for row in a:
+        intersect_len[i] = len(np.intersect1d(row, b))
+        union_len[i] = len(np.union1d(row, b))
+        i = i+1
+    return intersect_len/union_len
+
+
 def euclidean_dissim(a, b, **_):
     """Euclidean distance dissimilarity function"""
     if np.isnan(a).any() or np.isnan(b).any():
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index 4d8404c..d6b1f35 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -7,7 +7,7 @@
 import numpy as np
 from sklearn.utils.testing import assert_equal, assert_array_equal
 
-from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim
+from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim, jaccard_dissim_binary, jaccard_dissim_label
 
 
 class TestDissimilarityMeasures(unittest.TestCase):
@@ -25,6 +25,27 @@ def test_matching_dissim(self):
         b = np.array([['a', 'b', 'c', 'd'], ['d', 'c', 'b', 'a']])
         assert_array_equal(np.array([0, 4]), matching_dissim(a, b))
 
+    def test_jaccard_dissim_binary(self):
+        a = np.array([[0, 1, 1, 0, 1, 1]])
+        b = np.array([[0, 1, 1, 0, 1, 0]])
+        assert_equal(0.75, jaccard_dissim_binary(a, b))
+
+        a = np.array([[0, 1, 1, 0, 1, 1]])
+        b = np.array([[0, np.NaN, 1, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_binary(a, b)
+
+    def test_jaccard_dissim_label(self):
+        a = np.array([[0, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 1, 0]])
+        assert_equal(1, jaccard_dissim_label(a, b))
+
+        a = np.array([[np.NaN, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_label(a, b)
+
+
     def test_euclidian_dissim(self):
         a = np.array([[0., 1., 2., 0., 1., 2.]])
         b = np.array([[3., 1., 3., 0., 1., 0.]])

From 3c18f3b115f334b7c8219a95901a8eae35d56ce8 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 14:20:45 +0530
Subject: [PATCH 02/12] Resolve quality-check errors: bracket and operator
 whitespace, over-long import line

---
 kmodes/tests/test_kmodes.py      | 170 +++++++++++++++----------------
 kmodes/util/dissim.py            |   6 +-
 kmodes/util/tests/test_dissim.py |   3 +-
 3 files changed, 90 insertions(+), 89 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 6f7f3bd..1bc6a5d 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -225,91 +225,91 @@
 SOYBEAN3 = SOYBEAN3[:, :35]
 
 SOYBEAN4 = np.array([
-       [ 2, 22, 14, 45,  2,  0,  1,  2,  5],
-       [ 2, 13, 13, 19,  2,  0,  1,  2,  5],
-       [ 3, 25,  4,  3,  0,  1,  2,  0,  4],
-       [ 2, 13, 15, 18,  0,  1,  2,  2,  3],
-       [ 3, 10,  4, 42,  0,  2,  1,  1,  2],
-       [ 2, 16, 21, 14,  0,  1,  2,  2,  2],
-       [ 2, 16, 19, 37,  0,  2,  1,  2,  2],
-       [ 2, 20,  9, 34,  0,  1,  2,  3,  5],
-       [ 2, 14, 21, 44,  0,  1,  2,  3,  2],
-       [ 2, 26,  5, 30,  0,  1,  2,  3,  3],
-       [ 3, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 20,  1, 27,  3,  3,  3,  2,  0],
-       [ 3,  6,  8, 19,  0,  1,  2,  1,  2],
-       [ 2, 13,  8, 41,  3,  3,  3,  2,  0],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 16, 19, 42,  0,  1,  2,  2,  5],
-       [ 7,  7,  5, 43,  0,  2,  1,  2,  2],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 3,  3,  5, 12,  3,  3,  3,  2,  0],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 7, 15, 19, 17,  0,  1,  2,  2,  2],
-       [ 1,  1, 15, 24,  0,  1,  2,  2,  2],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2,  5,  7,  9,  0,  1,  2,  3,  5],
-       [ 2, 24,  6, 10,  0,  2,  1,  2,  2],
-       [ 2, 13, 16, 29,  0,  2,  1,  2,  2],
-       [ 3,  6,  8,  1,  0,  1,  2,  2,  5],
-       [ 2, 16, 15, 34,  0,  1,  2,  2,  1],
-       [ 0, 24, 14, 12,  3,  3,  3,  2,  0],
-       [ 3,  8, 21, 13,  3,  3,  3,  2,  0],
-       [ 2, 17, 15, 42,  3,  3,  3,  2,  0],
-       [ 2, 25, 18, 16,  3,  3,  3,  2,  0],
-       [ 2,  3, 15, 42,  3,  3,  3,  2,  0],
-       [ 6, 13, 15, 22,  3,  3,  3,  2,  0],
-       [ 3,  8, 18, 24,  1,  0,  2,  2,  5],
-       [ 7, 20, 15, 26,  1,  0,  2,  2,  1],
-       [ 2, 20,  7, 35,  0,  1,  2,  2,  5],
-       [ 2, 16, 12, 28,  0,  1,  2,  2,  5],
-       [ 2, 16,  5, 39,  0,  1,  2,  2,  2],
-       [ 3,  6, 11,  8,  0,  1,  2,  2,  2],
-       [ 7,  6, 15, 44,  1,  0,  2,  2,  4],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 16,  7,  6,  3,  3,  3,  2,  0],
-       [ 1, 13,  2, 46,  3,  3,  3,  2,  0],
-       [ 0, 14,  5, 41,  3,  3,  3,  2,  0],
-       [ 2, 24, 19,  0,  3,  3,  3,  2,  0],
-       [ 2, 14,  3, 35,  3,  3,  3,  2,  0],
-       [ 6, 19,  7,  5,  0,  2,  1,  2,  2],
-       [ 5,  6, 11, 44,  3,  3,  3,  2,  0],
-       [ 7, 16, 21, 21,  3,  3,  3,  2,  0],
-       [ 2, 19,  7, 44,  3,  3,  3,  2,  0],
-       [ 2, 24, 18, 33,  1,  0,  2,  1,  4],
-       [ 2, 16,  8, 44,  0,  2,  1,  2,  1],
-       [ 3,  2,  5, 15,  0,  1,  2,  2,  2],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2,  4, 15, 47,  0,  1,  2,  2,  2],
-       [ 7, 13, 15, 25,  0,  1,  2,  2,  1],
-       [ 1, 19, 10, 15,  3,  3,  3,  2,  0],
-       [ 2, 13,  5, 44,  0,  1,  2,  1,  2],
-       [ 5, 11, 18, 20,  3,  3,  3,  2,  0],
-       [ 7,  9,  5, 40,  0,  1,  2,  1,  4],
-       [ 3,  6, 16, 38,  3,  3,  3,  2,  0],
-       [ 2, 24, 22, 12,  0,  1,  2,  2,  3],
-       [ 5, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 16, 15, 23,  0,  1,  2,  2,  5],
-       [ 2, 13,  0, 25,  1,  0,  2,  2,  2],
-       [ 2, 23, 15, 36,  3,  3,  3,  2,  0],
-       [ 2, 25, 10,  2,  1,  0,  2,  2,  5],
-       [ 2, 21,  7,  4,  1,  0,  2,  2,  1],
-       [ 1, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [ 6,  9,  1,  0,  3,  3,  3,  2,  0],
-       [ 1,  7, 20, 47,  3,  3,  3,  2,  0],
-       [ 2, 25, 10,  7,  0,  1,  2,  2,  2],
-       [ 7,  0,  4, 32,  1,  2,  0,  2,  5],
-       [ 1, 12, 12, 15,  0,  1,  2,  3,  3],
-       [ 2, 26, 15, 25,  0,  1,  2,  0,  5],
-       [ 2, 20, 15, 19,  0,  1,  2,  2,  1],
-       [ 4,  6,  9, 11,  2,  0,  1,  1,  4],
-       [ 2, 13, 15, 42,  0,  2,  1,  2,  2],
-       [ 3,  5, 21, 31,  0,  1,  2,  3,  5],
-       [ 2, 13, 19, 33,  0,  2,  1,  2,  2],
-       [ 1, 11, 10,  0,  0,  2,  1,  0,  2]
+       [2, 22, 14, 45,  2,  0,  1,  2,  5],
+       [2, 13, 13, 19,  2,  0,  1,  2,  5],
+       [3, 25,  4,  3,  0,  1,  2,  0,  4],
+       [2, 13, 15, 18,  0,  1,  2,  2,  3],
+       [3, 10,  4, 42,  0,  2,  1,  1,  2],
+       [2, 16, 21, 14,  0,  1,  2,  2,  2],
+       [2, 16, 19, 37,  0,  2,  1,  2,  2],
+       [2, 20,  9, 34,  0,  1,  2,  3,  5],
+       [2, 14, 21, 44,  0,  1,  2,  3,  2],
+       [2, 26,  5, 30,  0,  1,  2,  3,  3],
+       [3, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 20,  1, 27,  3,  3,  3,  2,  0],
+       [3,  6,  8, 19,  0,  1,  2,  1,  2],
+       [2, 13,  8, 41,  3,  3,  3,  2,  0],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 16, 19, 42,  0,  1,  2,  2,  5],
+       [7,  7,  5, 43,  0,  2,  1,  2,  2],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [3,  3,  5, 12,  3,  3,  3,  2,  0],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [7, 15, 19, 17,  0,  1,  2,  2,  2],
+       [1,  1, 15, 24,  0,  1,  2,  2,  2],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2,  5,  7,  9,  0,  1,  2,  3,  5],
+       [2, 24,  6, 10,  0,  2,  1,  2,  2],
+       [2, 13, 16, 29,  0,  2,  1,  2,  2],
+       [3,  6,  8,  1,  0,  1,  2,  2,  5],
+       [2, 16, 15, 34,  0,  1,  2,  2,  1],
+       [0, 24, 14, 12,  3,  3,  3,  2,  0],
+       [3,  8, 21, 13,  3,  3,  3,  2,  0],
+       [2, 17, 15, 42,  3,  3,  3,  2,  0],
+       [2, 25, 18, 16,  3,  3,  3,  2,  0],
+       [2,  3, 15, 42,  3,  3,  3,  2,  0],
+       [6, 13, 15, 22,  3,  3,  3,  2,  0],
+       [3,  8, 18, 24,  1,  0,  2,  2,  5],
+       [7, 20, 15, 26,  1,  0,  2,  2,  1],
+       [2, 20,  7, 35,  0,  1,  2,  2,  5],
+       [2, 16, 12, 28,  0,  1,  2,  2,  5],
+       [2, 16,  5, 39,  0,  1,  2,  2,  2],
+       [3,  6, 11,  8,  0,  1,  2,  2,  2],
+       [7,  6, 15, 44,  1,  0,  2,  2,  4],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 16,  7,  6,  3,  3,  3,  2,  0],
+       [1, 13,  2, 46,  3,  3,  3,  2,  0],
+       [0, 14,  5, 41,  3,  3,  3,  2,  0],
+       [2, 24, 19,  0,  3,  3,  3,  2,  0],
+       [2, 14,  3, 35,  3,  3,  3,  2,  0],
+       [6, 19,  7,  5,  0,  2,  1,  2,  2],
+       [5,  6, 11, 44,  3,  3,  3,  2,  0],
+       [7, 16, 21, 21,  3,  3,  3,  2,  0],
+       [2, 19,  7, 44,  3,  3,  3,  2,  0],
+       [2, 24, 18, 33,  1,  0,  2,  1,  4],
+       [2, 16,  8, 44,  0,  2,  1,  2,  1],
+       [3,  2,  5, 15,  0,  1,  2,  2,  2],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2,  4, 15, 47,  0,  1,  2,  2,  2],
+       [7, 13, 15, 25,  0,  1,  2,  2,  1],
+       [1, 19, 10, 15,  3,  3,  3,  2,  0],
+       [2, 13,  5, 44,  0,  1,  2,  1,  2],
+       [5, 11, 18, 20,  3,  3,  3,  2,  0],
+       [7,  9,  5, 40,  0,  1,  2,  1,  4],
+       [3,  6, 16, 38,  3,  3,  3,  2,  0],
+       [2, 24, 22, 12,  0,  1,  2,  2,  3],
+       [5, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 16, 15, 23,  0,  1,  2,  2,  5],
+       [2, 13,  0, 25,  1,  0,  2,  2,  2],
+       [2, 23, 15, 36,  3,  3,  3,  2,  0],
+       [2, 25, 10,  2,  1,  0,  2,  2,  5],
+       [2, 21,  7,  4,  1,  0,  2,  2,  1],
+       [1, 18, 17, 41,  3,  3,  3,  2,  0],
+       [2, 18, 17, 41,  3,  3,  3,  2,  0],
+       [6,  9,  1,  0,  3,  3,  3,  2,  0],
+       [1,  7, 20, 47,  3,  3,  3,  2,  0],
+       [2, 25, 10,  7,  0,  1,  2,  2,  2],
+       [7,  0,  4, 32,  1,  2,  0,  2,  5],
+       [1, 12, 12, 15,  0,  1,  2,  3,  3],
+       [2, 26, 15, 25,  0,  1,  2,  0,  5],
+       [2, 20, 15, 19,  0,  1,  2,  2,  1],
+       [4,  6,  9, 11,  2,  0,  1,  1,  4],
+       [2, 13, 15, 42,  0,  2,  1,  2,  2],
+       [3,  5, 21, 31,  0,  1,  2,  3,  5],
+       [2, 13, 19, 33,  0,  2,  1,  2,  2],
+       [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
 SOYBEAN5 = np.array([
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index ecdfcd5..1b81bc8 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -14,7 +14,7 @@ def jaccard_dissim_binary(a, b, **__):
     """Jaccard dissimilarity function for biinary encoded variables"""
     if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2:
         raise ValueError("Missing or non Binary values detected in Binary columns.")
-    return np.sum(np.bitwise_and(a, b), axis=1)/np.sum(np.bitwise_or(a, b), axis=1)
+    return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
 
 
 def jaccard_dissim_label(a, b, **__):
@@ -27,8 +27,8 @@ def jaccard_dissim_label(a, b, **__):
     for row in a:
         intersect_len[i] = len(np.intersect1d(row, b))
         union_len[i] = len(np.union1d(row, b))
-        i = i+1
-    return intersect_len/union_len
+        i = i + 1
+    return intersect_len / union_len
 
 
 def euclidean_dissim(a, b, **_):
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index d6b1f35..fd713af 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -7,7 +7,8 @@
 import numpy as np
 from sklearn.utils.testing import assert_equal, assert_array_equal
 
-from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim, jaccard_dissim_binary, jaccard_dissim_label
+from kmodes.util.dissim import matching_dissim, euclidean_dissim, ng_dissim
+from kmodes.util.dissim import jaccard_dissim_binary, jaccard_dissim_label
 
 
 class TestDissimilarityMeasures(unittest.TestCase):

From c3741dfc81be273cd93a6538d247ad24422b5cc8 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 14:28:11 +0530
Subject: [PATCH 03/12] Resolve quality-check errors in test_kmodes.py: bracket
 whitespace, continuation indentation, blank lines

---
 kmodes/tests/test_kmodes.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 1bc6a5d..2030294 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -326,10 +326,10 @@
 SOYBEAN5 = SOYBEAN5[:, :35]
 
 SOYBEAN6 = np.array([
-       [ 2, 22, 14, 45,  2,  0,  1,  2,  5],
-       [ 7, 13, 13, 19,  2,  0,  1,  2,  5],
-       [ 5, 18, 19, 33,  0,  2,  1,  2,  2],
-       [ 1, 11, 10,  0,  0,  2,  1,  0,  2]
+       [2, 22, 14, 45,  2,  0,  1,  2,  5],
+       [7, 13, 13, 19,  2,  0,  1,  2,  5],
+       [5, 18, 19, 33,  0,  2,  1,  2,  2],
+       [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
 
@@ -554,7 +554,7 @@ def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
 
     def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
         kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
-                              cat_dissim=jaccard_dissim_binary, random_state=42)
+                            cat_dissim=jaccard_dissim_binary, random_state=42)
         result = kmodes_Cao.fit_predict(SOYBEAN3)
         expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2,
                              2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
@@ -566,13 +566,12 @@ def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
     def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_binary, random_state=42)
-        kmodes_huang =kmodes_huang.fit(SOYBEAN3)
+        kmodes_huang = kmodes_huang.fit(SOYBEAN3)
         result = kmodes_huang.fit_predict(SOYBEAN5)
         expected = np.array([1, 0, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
-
     def test_kmodes_huang_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)

From f8dbc153f5140f72600a50ececfc6f98f86ff6ba Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 14:35:27 +0530
Subject: [PATCH 04/12] Resolving errors in Quality check

---
 kmodes/tests/test_kmodes.py | 172 ++++++++++++++++++------------------
 1 file changed, 86 insertions(+), 86 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 2030294..e3af29d 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -225,91 +225,91 @@
 SOYBEAN3 = SOYBEAN3[:, :35]
 
 SOYBEAN4 = np.array([
-       [2, 22, 14, 45,  2,  0,  1,  2,  5],
-       [2, 13, 13, 19,  2,  0,  1,  2,  5],
-       [3, 25,  4,  3,  0,  1,  2,  0,  4],
-       [2, 13, 15, 18,  0,  1,  2,  2,  3],
-       [3, 10,  4, 42,  0,  2,  1,  1,  2],
-       [2, 16, 21, 14,  0,  1,  2,  2,  2],
-       [2, 16, 19, 37,  0,  2,  1,  2,  2],
-       [2, 20,  9, 34,  0,  1,  2,  3,  5],
-       [2, 14, 21, 44,  0,  1,  2,  3,  2],
-       [2, 26,  5, 30,  0,  1,  2,  3,  3],
-       [3, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 20,  1, 27,  3,  3,  3,  2,  0],
-       [3,  6,  8, 19,  0,  1,  2,  1,  2],
-       [2, 13,  8, 41,  3,  3,  3,  2,  0],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 16, 19, 42,  0,  1,  2,  2,  5],
-       [7,  7,  5, 43,  0,  2,  1,  2,  2],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [3,  3,  5, 12,  3,  3,  3,  2,  0],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [7, 15, 19, 17,  0,  1,  2,  2,  2],
-       [1,  1, 15, 24,  0,  1,  2,  2,  2],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2,  5,  7,  9,  0,  1,  2,  3,  5],
-       [2, 24,  6, 10,  0,  2,  1,  2,  2],
-       [2, 13, 16, 29,  0,  2,  1,  2,  2],
-       [3,  6,  8,  1,  0,  1,  2,  2,  5],
-       [2, 16, 15, 34,  0,  1,  2,  2,  1],
-       [0, 24, 14, 12,  3,  3,  3,  2,  0],
-       [3,  8, 21, 13,  3,  3,  3,  2,  0],
-       [2, 17, 15, 42,  3,  3,  3,  2,  0],
-       [2, 25, 18, 16,  3,  3,  3,  2,  0],
-       [2,  3, 15, 42,  3,  3,  3,  2,  0],
-       [6, 13, 15, 22,  3,  3,  3,  2,  0],
-       [3,  8, 18, 24,  1,  0,  2,  2,  5],
-       [7, 20, 15, 26,  1,  0,  2,  2,  1],
-       [2, 20,  7, 35,  0,  1,  2,  2,  5],
-       [2, 16, 12, 28,  0,  1,  2,  2,  5],
-       [2, 16,  5, 39,  0,  1,  2,  2,  2],
-       [3,  6, 11,  8,  0,  1,  2,  2,  2],
-       [7,  6, 15, 44,  1,  0,  2,  2,  4],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 16,  7,  6,  3,  3,  3,  2,  0],
-       [1, 13,  2, 46,  3,  3,  3,  2,  0],
-       [0, 14,  5, 41,  3,  3,  3,  2,  0],
-       [2, 24, 19,  0,  3,  3,  3,  2,  0],
-       [2, 14,  3, 35,  3,  3,  3,  2,  0],
-       [6, 19,  7,  5,  0,  2,  1,  2,  2],
-       [5,  6, 11, 44,  3,  3,  3,  2,  0],
-       [7, 16, 21, 21,  3,  3,  3,  2,  0],
-       [2, 19,  7, 44,  3,  3,  3,  2,  0],
-       [2, 24, 18, 33,  1,  0,  2,  1,  4],
-       [2, 16,  8, 44,  0,  2,  1,  2,  1],
-       [3,  2,  5, 15,  0,  1,  2,  2,  2],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2,  4, 15, 47,  0,  1,  2,  2,  2],
-       [7, 13, 15, 25,  0,  1,  2,  2,  1],
-       [1, 19, 10, 15,  3,  3,  3,  2,  0],
-       [2, 13,  5, 44,  0,  1,  2,  1,  2],
-       [5, 11, 18, 20,  3,  3,  3,  2,  0],
-       [7,  9,  5, 40,  0,  1,  2,  1,  4],
-       [3,  6, 16, 38,  3,  3,  3,  2,  0],
-       [2, 24, 22, 12,  0,  1,  2,  2,  3],
-       [5, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 16, 15, 23,  0,  1,  2,  2,  5],
-       [2, 13,  0, 25,  1,  0,  2,  2,  2],
-       [2, 23, 15, 36,  3,  3,  3,  2,  0],
-       [2, 25, 10,  2,  1,  0,  2,  2,  5],
-       [2, 21,  7,  4,  1,  0,  2,  2,  1],
-       [1, 18, 17, 41,  3,  3,  3,  2,  0],
-       [2, 18, 17, 41,  3,  3,  3,  2,  0],
-       [6,  9,  1,  0,  3,  3,  3,  2,  0],
-       [1,  7, 20, 47,  3,  3,  3,  2,  0],
-       [2, 25, 10,  7,  0,  1,  2,  2,  2],
-       [7,  0,  4, 32,  1,  2,  0,  2,  5],
-       [1, 12, 12, 15,  0,  1,  2,  3,  3],
-       [2, 26, 15, 25,  0,  1,  2,  0,  5],
-       [2, 20, 15, 19,  0,  1,  2,  2,  1],
-       [4,  6,  9, 11,  2,  0,  1,  1,  4],
-       [2, 13, 15, 42,  0,  2,  1,  2,  2],
-       [3,  5, 21, 31,  0,  1,  2,  3,  5],
-       [2, 13, 19, 33,  0,  2,  1,  2,  2],
-       [1, 11, 10,  0,  0,  2,  1,  0,  2]
+    [2, 22, 14, 45,  2,  0,  1,  2,  5],
+    [2, 13, 13, 19,  2,  0,  1,  2,  5],
+    [3, 25,  4,  3,  0,  1,  2,  0,  4],
+    [2, 13, 15, 18,  0,  1,  2,  2,  3],
+    [3, 10,  4, 42,  0,  2,  1,  1,  2],
+    [2, 16, 21, 14,  0,  1,  2,  2,  2],
+    [2, 16, 19, 37,  0,  2,  1,  2,  2],
+    [2, 20,  9, 34,  0,  1,  2,  3,  5],
+    [2, 14, 21, 44,  0,  1,  2,  3,  2],
+    [2, 26,  5, 30,  0,  1,  2,  3,  3],
+    [3, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 20,  1, 27,  3,  3,  3,  2,  0],
+    [3,  6,  8, 19,  0,  1,  2,  1,  2],
+    [2, 13,  8, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16, 19, 42,  0,  1,  2,  2,  5],
+    [7,  7,  5, 43,  0,  2,  1,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [3,  3,  5, 12,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [7, 15, 19, 17,  0,  1,  2,  2,  2],
+    [1,  1, 15, 24,  0,  1,  2,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2,  5,  7,  9,  0,  1,  2,  3,  5],
+    [2, 24,  6, 10,  0,  2,  1,  2,  2],
+    [2, 13, 16, 29,  0,  2,  1,  2,  2],
+    [3,  6,  8,  1,  0,  1,  2,  2,  5],
+    [2, 16, 15, 34,  0,  1,  2,  2,  1],
+    [0, 24, 14, 12,  3,  3,  3,  2,  0],
+    [3,  8, 21, 13,  3,  3,  3,  2,  0],
+    [2, 17, 15, 42,  3,  3,  3,  2,  0],
+    [2, 25, 18, 16,  3,  3,  3,  2,  0],
+    [2,  3, 15, 42,  3,  3,  3,  2,  0],
+    [6, 13, 15, 22,  3,  3,  3,  2,  0],
+    [3,  8, 18, 24,  1,  0,  2,  2,  5],
+    [7, 20, 15, 26,  1,  0,  2,  2,  1],
+    [2, 20,  7, 35,  0,  1,  2,  2,  5],
+    [2, 16, 12, 28,  0,  1,  2,  2,  5],
+    [2, 16,  5, 39,  0,  1,  2,  2,  2],
+    [3,  6, 11,  8,  0,  1,  2,  2,  2],
+    [7,  6, 15, 44,  1,  0,  2,  2,  4],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16,  7,  6,  3,  3,  3,  2,  0],
+    [1, 13,  2, 46,  3,  3,  3,  2,  0],
+    [0, 14,  5, 41,  3,  3,  3,  2,  0],
+    [2, 24, 19,  0,  3,  3,  3,  2,  0],
+    [2, 14,  3, 35,  3,  3,  3,  2,  0],
+    [6, 19,  7,  5,  0,  2,  1,  2,  2],
+    [5,  6, 11, 44,  3,  3,  3,  2,  0],
+    [7, 16, 21, 21,  3,  3,  3,  2,  0],
+    [2, 19,  7, 44,  3,  3,  3,  2,  0],
+    [2, 24, 18, 33,  1,  0,  2,  1,  4],
+    [2, 16,  8, 44,  0,  2,  1,  2,  1],
+    [3,  2,  5, 15,  0,  1,  2,  2,  2],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2,  4, 15, 47,  0,  1,  2,  2,  2],
+    [7, 13, 15, 25,  0,  1,  2,  2,  1],
+    [1, 19, 10, 15,  3,  3,  3,  2,  0],
+    [2, 13,  5, 44,  0,  1,  2,  1,  2],
+    [5, 11, 18, 20,  3,  3,  3,  2,  0],
+    [7,  9,  5, 40,  0,  1,  2,  1,  4],
+    [3,  6, 16, 38,  3,  3,  3,  2,  0],
+    [2, 24, 22, 12,  0,  1,  2,  2,  3],
+    [5, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 16, 15, 23,  0,  1,  2,  2,  5],
+    [2, 13,  0, 25,  1,  0,  2,  2,  2],
+    [2, 23, 15, 36,  3,  3,  3,  2,  0],
+    [2, 25, 10,  2,  1,  0,  2,  2,  5],
+    [2, 21,  7,  4,  1,  0,  2,  2,  1],
+    [1, 18, 17, 41,  3,  3,  3,  2,  0],
+    [2, 18, 17, 41,  3,  3,  3,  2,  0],
+    [6,  9,  1,  0,  3,  3,  3,  2,  0],
+    [1,  7, 20, 47,  3,  3,  3,  2,  0],
+    [2, 25, 10,  7,  0,  1,  2,  2,  2],
+    [7,  0,  4, 32,  1,  2,  0,  2,  5],
+    [1, 12, 12, 15,  0,  1,  2,  3,  3],
+    [2, 26, 15, 25,  0,  1,  2,  0,  5],
+    [2, 20, 15, 19,  0,  1,  2,  2,  1],
+    [4,  6,  9, 11,  2,  0,  1,  1,  4],
+    [2, 13, 15, 42,  0,  2,  1,  2,  2],
+    [3,  5, 21, 31,  0,  1,  2,  3,  5],
+    [2, 13, 19, 33,  0,  2,  1,  2,  2],
+    [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
 SOYBEAN5 = np.array([
@@ -600,7 +600,7 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self):
     def test_kmodes_predict_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
-        kmodes_huang =kmodes_huang.fit(SOYBEAN4)
+        kmodes_huang = kmodes_huang.fit(SOYBEAN4)
         result = kmodes_huang.fit_predict(SOYBEAN6)
         expected = np.array([0, 1, 0, 3])
         assert_cluster_splits_equal(result, expected)

From 0bcc371847ab14c3e99ddf91d1b7b042111fde85 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 14:38:46 +0530
Subject: [PATCH 05/12] Resolving errors in Quality check

---
 kmodes/tests/test_kmodes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index e3af29d..1420fd5 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -326,10 +326,10 @@
 SOYBEAN5 = SOYBEAN5[:, :35]
 
 SOYBEAN6 = np.array([
-       [2, 22, 14, 45,  2,  0,  1,  2,  5],
-       [7, 13, 13, 19,  2,  0,  1,  2,  5],
-       [5, 18, 19, 33,  0,  2,  1,  2,  2],
-       [1, 11, 10,  0,  0,  2,  1,  0,  2]
+    [2, 22, 14, 45,  2,  0,  1,  2,  5],
+    [7, 13, 13, 19,  2,  0,  1,  2,  5],
+    [5, 18, 19, 33,  0,  2,  1,  2,  2],
+    [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
 

From 44fcc68888ceab974288573ab6d115c2e5165947 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Sun, 21 Jul 2019 16:32:12 +0530
Subject: [PATCH 06/12] Resolving Build errors for python ver 3.6.0

---
 kmodes/tests/test_kmodes.py      | 22 +++++++++++-----------
 kmodes/util/dissim.py            |  3 ++-
 kmodes/util/tests/test_dissim.py |  2 +-
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 1420fd5..a09bf6c 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -576,11 +576,11 @@ def test_kmodes_huang_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
         result = kmodes_huang.fit_predict(SOYBEAN4)
-        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3,
-                             0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1,
-                             1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1,
-                             0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3,
-                             3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
+        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+                             0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
+                             0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2,
+                             0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1,
+                             0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
@@ -588,11 +588,11 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
         result = kmodes_huang.fit_predict(SOYBEAN4)
-        expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
-                             1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0,
-                             1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1,
-                             0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])
+        expected = np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 1, 0,
+                             1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
+                             0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0])
 
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
@@ -602,7 +602,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_label(self):
                               cat_dissim=jaccard_dissim_label, random_state=42)
         kmodes_huang = kmodes_huang.fit(SOYBEAN4)
         result = kmodes_huang.fit_predict(SOYBEAN6)
-        expected = np.array([0, 1, 0, 3])
+        expected = np.array([0, 0, 0, 0])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index 1b81bc8..f2d3df2 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -26,7 +26,8 @@ def jaccard_dissim_label(a, b, **__):
     i = 0
     for row in a:
         intersect_len[i] = len(np.intersect1d(row, b))
-        union_len[i] = len(np.union1d(row, b))
+        union_len[i] = len(row) + len(b) - intersect_len[i]
+        # union_len[i] = np.unique(np.concatenate((row, b)))
         i = i + 1
     return intersect_len / union_len
 
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index fd713af..2db5edf 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -39,7 +39,7 @@ def test_jaccard_dissim_binary(self):
     def test_jaccard_dissim_label(self):
         a = np.array([[0, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 1, 0]])
-        assert_equal(1, jaccard_dissim_label(a, b))
+        assert_equal(0.75, jaccard_dissim_label(a, b))
 
         a = np.array([[np.NaN, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 1, 0]])

From 8e97e631da24177368b9f6ae7cd4b5d4f2baa304 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <BIKASH.PANDEY@springandriver.com>
Date: Tue, 23 Jul 2019 11:51:22 +0530
Subject: [PATCH 07/12] Added some recommended changes

---
 kmodes/tests/test_kmodes.py      | 141 ++++---------------------------
 kmodes/util/dissim.py            |  13 ++-
 kmodes/util/tests/test_dissim.py |   6 ++
 3 files changed, 30 insertions(+), 130 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index a09bf6c..2e43f9a 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -124,106 +124,6 @@
 # Drop target column
 SOYBEAN2 = SOYBEAN2[:, :35]
 
-# SOYBEAN Binary encoded
-SOYBEAN3 = np.array([
-    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D3'],
-    [1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-    [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-])
-# Drop target column
-SOYBEAN3 = SOYBEAN3[:, :35]
-
 SOYBEAN4 = np.array([
     [2, 22, 14, 45,  2,  0,  1,  2,  5],
     [2, 13, 13, 19,  2,  0,  1,  2,  5],
@@ -312,19 +212,6 @@
     [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
-SOYBEAN5 = np.array([
-    [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D1'],
-    [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D2'],
-    [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 0, 'D3'],
-    [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
-     1, 0, 0, 0, 0, 0, 1, 'D4'],
-])
-# Drop target column
-SOYBEAN5 = SOYBEAN5[:, :35]
-
 SOYBEAN6 = np.array([
     [2, 22, 14, 45,  2,  0,  1,  2,  5],
     [7, 13, 13, 19,  2,  0,  1,  2,  5],
@@ -545,19 +432,23 @@ def test_kmodes_nunique_nclusters_ng(self):
     def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_binary, random_state=42)
-        result = kmodes_huang.fit_predict(SOYBEAN3)
-        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3,
-                             3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
-                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+        # binary encoded variables are required
+        bin_variables = SOYBEAN.astype(bool).astype(int)
+        result = kmodes_huang.fit_predict(bin_variables)
+        expected = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1,
+                             1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
     def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
         kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
                             cat_dissim=jaccard_dissim_binary, random_state=42)
-        result = kmodes_Cao.fit_predict(SOYBEAN3)
-        expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 1, 2,
-                             2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0,
+        # binary encoded variables are required
+        bin_variables = SOYBEAN.astype(bool).astype(int)
+        result = kmodes_Cao.fit_predict(bin_variables)
+        expected = np.array([3, 2, 2, 3, 3, 2, 3, 2, 2, 3, 1, 2, 1, 2, 1, 2, 1,
+                             1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
         assert_cluster_splits_equal(result, expected)
@@ -566,9 +457,13 @@ def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
     def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_binary, random_state=42)
-        kmodes_huang = kmodes_huang.fit(SOYBEAN3)
-        result = kmodes_huang.fit_predict(SOYBEAN5)
-        expected = np.array([1, 0, 1, 1])
+        # binary encoded variables are required
+        bin_variables = SOYBEAN.astype(bool).astype(int)
+        kmodes_huang = kmodes_huang.fit(bin_variables)
+        # binary encoded variables required for prediction as well
+        bin_variables_pred = SOYBEAN2.astype(bool).astype(int)
+        result = kmodes_huang.fit_predict(bin_variables_pred)
+        expected = np.array([1, 2, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index f2d3df2..4c28504 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -11,15 +11,15 @@ def matching_dissim(a, b, **_):
 
 
 def jaccard_dissim_binary(a, b, **__):
-    """Jaccard dissimilarity function for biinary encoded variables"""
-    if len(np.unique(a.astype(int))) > 2 or len(np.unique(b.astype(int))) > 2:
-        raise ValueError("Missing or non Binary values detected in Binary columns.")
-    return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
+    """Jaccard dissimilarity function for binary encoded variables"""
+    if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all():
+        return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
+    raise ValueError("Missing or non Binary values detected in Binary columns.")
 
 
 def jaccard_dissim_label(a, b, **__):
     """Jaccard dissimilarity function for label encoded variables"""
-    if (a.astype(int) < 0).any() or (b.astype(int) < 0).any():
+    if np.isnan(np.array(a, dtype=np.float64)).any() or np.isnan(np.array(b, dtype=np.float64)).any():
         raise ValueError("Missing values detected in Numeric columns.")
     intersect_len = np.empty(len(a), dtype=int)
     union_len = np.empty(len(a), dtype=int)
@@ -27,8 +27,7 @@ def jaccard_dissim_label(a, b, **__):
     for row in a:
         intersect_len[i] = len(np.intersect1d(row, b))
         union_len[i] = len(row) + len(b) - intersect_len[i]
-        # union_len[i] = np.unique(np.concatenate((row, b)))
-        i = i + 1
+        i += 1
     return intersect_len / union_len
 
 
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index 2db5edf..826de7e 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -36,6 +36,12 @@ def test_jaccard_dissim_binary(self):
         with self.assertRaises(ValueError):
             jaccard_dissim_binary(a, b)
 
+        # test where values are non-binary but contain no np.NaN
+        a = np.array([[0, 1, 2, 0, 1, 2]])
+        b = np.array([[0, 1, 2, 0, 1, 0]])
+        with self.assertRaises(ValueError):
+            jaccard_dissim_binary(a, b)
+
     def test_jaccard_dissim_label(self):
         a = np.array([[0, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 1, 0]])

From 45078dd070499319e3bc5ac1d6a5e663360bfa67 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <BIKASH.PANDEY@springandriver.com>
Date: Tue, 23 Jul 2019 12:03:56 +0530
Subject: [PATCH 08/12] Resolving errors in Quality Check

---
 kmodes/util/dissim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index 4c28504..e5a5cf9 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -19,7 +19,7 @@ def jaccard_dissim_binary(a, b, **__):
 
 def jaccard_dissim_label(a, b, **__):
     """Jaccard dissimilarity function for label encoded variables"""
-    if np.isnan(np.array(a, dtype=np.float64)).any() or np.isnan(np.array(b, dtype=np.float64)).any():
+    if np.isnan(a.astype('float64')).any() or np.isnan(b.astype('float64')).any():
         raise ValueError("Missing values detected in Numeric columns.")
     intersect_len = np.empty(len(a), dtype=int)
     union_len = np.empty(len(a), dtype=int)

From 71ade685f6ee2452ca757c7f09d79e15a41a3b48 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <BIKASH.PANDEY@springandriver.com>
Date: Fri, 26 Jul 2019 18:48:08 +0530
Subject: [PATCH 09/12] added suggested changes and corrected logic error

---
 kmodes/tests/test_kmodes.py      | 35 ++++++++++++++++----------------
 kmodes/util/dissim.py            |  4 +++-
 kmodes/util/tests/test_dissim.py | 23 ++++++++++++++++++++-
 3 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 2e43f9a..e93a9a9 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -124,7 +124,8 @@
 # Drop target column
 SOYBEAN2 = SOYBEAN2[:, :35]
 
-SOYBEAN4 = np.array([
+# test data with categorical variables that have been label encoded
+TEST_DATA = np.array([
     [2, 22, 14, 45,  2,  0,  1,  2,  5],
     [2, 13, 13, 19,  2,  0,  1,  2,  5],
     [3, 25,  4,  3,  0,  1,  2,  0,  4],
@@ -212,7 +213,7 @@
     [1, 11, 10,  0,  0,  2,  1,  0,  2]
 ])
 
-SOYBEAN6 = np.array([
+TEST_DATA_PREDICT = np.array([
     [2, 22, 14, 45,  2,  0,  1,  2,  5],
     [7, 13, 13, 19,  2,  0,  1,  2,  5],
     [5, 18, 19, 33,  0,  2,  1,  2,  2],
@@ -470,24 +471,24 @@ def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
     def test_kmodes_huang_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
-        result = kmodes_huang.fit_predict(SOYBEAN4)
-        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-                             0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
-                             0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2,
-                             0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1,
-                             0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1])
+        result = kmodes_huang.fit_predict(TEST_DATA)
+        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3,
+                             0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1,
+                             1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1,
+                             0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3,
+                             3, 0,0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
     def test_kmodes_cao_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
-        result = kmodes_huang.fit_predict(SOYBEAN4)
-        expected = np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 1, 0,
-                             1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
-                             0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0])
+        result = kmodes_huang.fit_predict(TEST_DATA)
+        expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
+                             1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0,
+                             1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1,
+                             0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])
 
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
@@ -495,9 +496,9 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self):
     def test_kmodes_predict_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
-        kmodes_huang = kmodes_huang.fit(SOYBEAN4)
-        result = kmodes_huang.fit_predict(SOYBEAN6)
-        expected = np.array([0, 0, 0, 0])
+        kmodes_huang = kmodes_huang.fit(TEST_DATA)
+        result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
+        expected = np.array([0, 1, 0, 3])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index e5a5cf9..a2092c0 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -26,8 +26,10 @@ def jaccard_dissim_label(a, b, **__):
     i = 0
     for row in a:
         intersect_len[i] = len(np.intersect1d(row, b))
-        union_len[i] = len(row) + len(b) - intersect_len[i]
+        union_len[i] = len(np.unique(row)) + len(np.unique(b)) - intersect_len[i]
         i += 1
+    if (union_len == 0).any():
+        raise ValueError("Insufficient Number of data since union is 0")
     return intersect_len / union_len
 
 
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index 826de7e..1924541 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -42,9 +42,20 @@ def test_jaccard_dissim_binary(self):
         with self.assertRaises(ValueError):
             jaccard_dissim_binary(a, b)
 
+        # test for dissimilarity = 1
+        a = np.array([[1, 1, 0, 1, 1, 0]])
+        b = np.array([[1, 1, 0, 1, 1, 0]])
+        assert_equal(1, jaccard_dissim_binary(a, b))
+
+        # test for dissimilarity = 0
+        a = np.array([[0, 0, 1, 0, 0, 1]])
+        b = np.array([[1, 1, 0, 1, 1, 0]])
+        assert_equal(0, jaccard_dissim_binary(a, b))
+
+
     def test_jaccard_dissim_label(self):
         a = np.array([[0, 1, 2, 0, 1, 2]])
-        b = np.array([[0, 1, 2, 0, 1, 0]])
+        b = np.array([[0, 1, 2, 0, 3, 0]])
         assert_equal(0.75, jaccard_dissim_label(a, b))
 
         a = np.array([[np.NaN, 1, 2, 0, 1, 2]])
@@ -52,6 +63,16 @@ def test_jaccard_dissim_label(self):
         with self.assertRaises(ValueError):
             jaccard_dissim_label(a, b)
 
+        # test for dissimilarity = 1
+        a = np.array([[1, 2, 0, 3, 1, 0]])
+        b = np.array([[1, 2, 0, 3, 1, 0]])
+        assert_equal(1, jaccard_dissim_label(a, b))
+
+        # test for dissimilarity = 0
+        a = np.array([[1, 2, 0, 3, 1, 0]])
+        b = np.array([[5, 4, 6, 7, 8, 9]])
+        assert_equal(0, jaccard_dissim_label(a, b))
+
 
     def test_euclidian_dissim(self):
         a = np.array([[0., 1., 2., 0., 1., 2.]])

From f4598d31e2c38f16452714065484623043464226 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Fri, 26 Jul 2019 22:23:04 +0530
Subject: [PATCH 10/12] resolving Quality check errors

---
 kmodes/tests/test_kmodes.py      | 2 +-
 kmodes/util/tests/test_dissim.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index e93a9a9..3ec2137 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -476,7 +476,7 @@ def test_kmodes_huang_soybean_jaccard_dissim_label(self):
                              0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1,
                              1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1,
                              0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3,
-                             3, 0,0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
+                             3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index 1924541..0f2f267 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -52,7 +52,6 @@ def test_jaccard_dissim_binary(self):
         b = np.array([[1, 1, 0, 1, 1, 0]])
         assert_equal(0, jaccard_dissim_binary(a, b))
 
-
     def test_jaccard_dissim_label(self):
         a = np.array([[0, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 3, 0]])

From 9fe5b6dd274afb6a57b9961523144a578707da30 Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Fri, 26 Jul 2019 23:07:57 +0530
Subject: [PATCH 11/12] Final jaccard dissimilarity/distance logic established

---
 kmodes/tests/test_kmodes.py      | 35 ++++++++++++++++----------------
 kmodes/util/dissim.py            |  4 ++--
 kmodes/util/tests/test_dissim.py | 20 +++++++++---------
 3 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 3ec2137..4d3db10 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -436,9 +436,9 @@ def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
         # binary encoded variables are required
         bin_variables = SOYBEAN.astype(bool).astype(int)
         result = kmodes_huang.fit_predict(bin_variables)
-        expected = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1,
-                             1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
+                             3, 3, 1, 1, 3, 1, 3, 1, 1])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
@@ -448,9 +448,9 @@ def test_kmodes_cao_soybean_jaccard_dissim_binary(self):
         # binary encoded variables are required
         bin_variables = SOYBEAN.astype(bool).astype(int)
         result = kmodes_Cao.fit_predict(bin_variables)
-        expected = np.array([3, 2, 2, 3, 3, 2, 3, 2, 2, 3, 1, 2, 1, 2, 1, 2, 1,
-                             1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                             1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0])
 
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
@@ -464,7 +464,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
         # binary encoded variables required for prediction as well
         bin_variables_pred = SOYBEAN2.astype(bool).astype(int)
         result = kmodes_huang.fit_predict(bin_variables_pred)
-        expected = np.array([1, 2, 1, 1])
+        expected = np.array([0, 1, 2, 3])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
@@ -472,11 +472,11 @@ def test_kmodes_huang_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
         result = kmodes_huang.fit_predict(TEST_DATA)
-        expected = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 3, 1, 1, 3,
-                             0, 3, 1, 1, 3, 0, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0, 0, 1,
-                             1, 1, 1, 0, 2, 3, 3, 3, 0, 3, 0, 0, 2, 0, 3, 0, 1, 1,
-                             0, 3, 1, 1, 0, 1, 0, 1, 3, 0, 3, 3, 1, 1, 0, 1, 1, 3,
-                             3, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1])
+        expected = np.array([3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 0, 0,
+                             0, 2, 2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 2, 3, 3,
+                             3, 2, 2, 0, 0, 2, 1, 0, 0, 0, 2, 3, 0, 0, 2, 3, 2, 0, 2,
+                             2, 2, 3, 0, 3, 2, 2, 0, 0, 3, 2, 1, 3, 2, 0, 0, 2, 2, 2,
+                             3, 2, 2, 2, 2, 1, 3, 2, 2])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
@@ -484,11 +484,10 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
         result = kmodes_huang.fit_predict(TEST_DATA)
-        expected = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
-                             1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0,
-                             1, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 1, 0, 0, 1,
-                             0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])
+        expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1,
+                             0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2,
+                             0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0,
+                             1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1])
 
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
@@ -498,7 +497,7 @@ def test_kmodes_predict_soybean_jaccard_dissim_label(self):
                               cat_dissim=jaccard_dissim_label, random_state=42)
         kmodes_huang = kmodes_huang.fit(TEST_DATA)
         result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
-        expected = np.array([0, 1, 0, 3])
+        expected = np.array([1, 0, 1, 2])
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
 
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index a2092c0..a764b5a 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -13,7 +13,7 @@ def matching_dissim(a, b, **_):
 def jaccard_dissim_binary(a, b, **__):
     """Jaccard dissimilarity function for binary encoded variables"""
     if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all():
-        return np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
+        return 1 - np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
     raise ValueError("Missing or non Binary values detected in Binary columns.")
 
 
@@ -30,7 +30,7 @@ def jaccard_dissim_label(a, b, **__):
         i += 1
     if (union_len == 0).any():
         raise ValueError("Insufficient Number of data since union is 0")
-    return intersect_len / union_len
+    return 1 - intersect_len / union_len
 
 
 def euclidean_dissim(a, b, **_):
diff --git a/kmodes/util/tests/test_dissim.py b/kmodes/util/tests/test_dissim.py
index 0f2f267..24a2be0 100644
--- a/kmodes/util/tests/test_dissim.py
+++ b/kmodes/util/tests/test_dissim.py
@@ -29,7 +29,7 @@ def test_matching_dissim(self):
     def test_jaccard_dissim_binary(self):
         a = np.array([[0, 1, 1, 0, 1, 1]])
         b = np.array([[0, 1, 1, 0, 1, 0]])
-        assert_equal(0.75, jaccard_dissim_binary(a, b))
+        assert_equal(0.25, jaccard_dissim_binary(a, b))
 
         a = np.array([[0, 1, 1, 0, 1, 1]])
         b = np.array([[0, np.NaN, 1, 0, 1, 0]])
@@ -42,35 +42,35 @@ def test_jaccard_dissim_binary(self):
         with self.assertRaises(ValueError):
             jaccard_dissim_binary(a, b)
 
-        # test for dissimilarity = 1
+        # test for dissimilarity = 0 both sets are same
         a = np.array([[1, 1, 0, 1, 1, 0]])
         b = np.array([[1, 1, 0, 1, 1, 0]])
-        assert_equal(1, jaccard_dissim_binary(a, b))
+        assert_equal(0, jaccard_dissim_binary(a, b))
 
-        # test for dissimilarity = 0
+        # test for dissimilarity = 1 sets are different
         a = np.array([[0, 0, 1, 0, 0, 1]])
         b = np.array([[1, 1, 0, 1, 1, 0]])
-        assert_equal(0, jaccard_dissim_binary(a, b))
+        assert_equal(1, jaccard_dissim_binary(a, b))
 
     def test_jaccard_dissim_label(self):
         a = np.array([[0, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 3, 0]])
-        assert_equal(0.75, jaccard_dissim_label(a, b))
+        assert_equal(0.25, jaccard_dissim_label(a, b))
 
         a = np.array([[np.NaN, 1, 2, 0, 1, 2]])
         b = np.array([[0, 1, 2, 0, 1, 0]])
         with self.assertRaises(ValueError):
             jaccard_dissim_label(a, b)
 
-        # test for dissimilarity = 1
+        # test for dissimilarity = 0 Both sets are same
         a = np.array([[1, 2, 0, 3, 1, 0]])
         b = np.array([[1, 2, 0, 3, 1, 0]])
-        assert_equal(1, jaccard_dissim_label(a, b))
+        assert_equal(0, jaccard_dissim_label(a, b))
 
-        # test for dissimilarity = 0
+        # test for dissimilarity = 1 Both sets are different
         a = np.array([[1, 2, 0, 3, 1, 0]])
         b = np.array([[5, 4, 6, 7, 8, 9]])
-        assert_equal(0, jaccard_dissim_label(a, b))
+        assert_equal(1, jaccard_dissim_label(a, b))
 
 
     def test_euclidian_dissim(self):

From 4f2efa060783bc13942cf289e9f143202bf5c74c Mon Sep 17 00:00:00 2001
From: BikashPandey17 <pandeybikash98@gmail.com>
Date: Fri, 26 Jul 2019 23:19:16 +0530
Subject: [PATCH 12/12] resolving Quality check errors and added denominator 0
 check

---
 kmodes/tests/test_kmodes.py | 9 +++++----
 kmodes/util/dissim.py       | 7 ++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py
index 4d3db10..6aecc0a 100644
--- a/kmodes/tests/test_kmodes.py
+++ b/kmodes/tests/test_kmodes.py
@@ -484,10 +484,11 @@ def test_kmodes_cao_soybean_jaccard_dissim_label(self):
         kmodes_huang = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2,
                               cat_dissim=jaccard_dissim_label, random_state=42)
         result = kmodes_huang.fit_predict(TEST_DATA)
-        expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0, 0, 0, 1, 1,
-                             0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 3, 1, 1, 2, 2, 0, 0, 2,
-                             0, 0, 0, 0, 3, 2, 2, 2, 0, 1, 1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0,
-                             1, 1, 0, 1, 1, 0, 0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1])
+        expected = np.array([3, 3, 1, 0, 0, 1, 1, 3, 2, 3, 0, 3, 2, 0, 0, 3, 3, 0,
+                             0, 0, 1, 1, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1,
+                             3, 1, 1, 2, 2, 0, 0, 2, 0, 0, 0, 0, 3, 2, 2, 2, 0, 1,
+                             1, 0, 1, 1, 1, 3, 0, 3, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0,
+                             0, 2, 2, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1])
 
         assert_cluster_splits_equal(result, expected)
         self.assertTrue(result.dtype == np.dtype(np.uint16))
diff --git a/kmodes/util/dissim.py b/kmodes/util/dissim.py
index a764b5a..08a15f6 100644
--- a/kmodes/util/dissim.py
+++ b/kmodes/util/dissim.py
@@ -13,7 +13,12 @@ def matching_dissim(a, b, **_):
 def jaccard_dissim_binary(a, b, **__):
     """Jaccard dissimilarity function for binary encoded variables"""
     if ((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all():
-        return 1 - np.sum(np.bitwise_and(a, b), axis=1) / np.sum(np.bitwise_or(a, b), axis=1)
+        numerator = np.sum(np.bitwise_and(a, b), axis=1)
+        denominator = np.sum(np.bitwise_or(a, b), axis=1)
+        if (denominator == 0).any(0):
+            raise ValueError("Insufficient Number of data since union is 0")
+        else:
+            return 1 - numerator / denominator
     raise ValueError("Missing or non Binary values detected in Binary columns.")