-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMyFuzzyCmeans.py
121 lines (105 loc) · 4.25 KB
/
MyFuzzyCmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from scipy.spatial.distance import euclidean
import pandas as pd
import numpy as np
from itertools import permutations
class MyFuzzyCmeans:
    """Fuzzy C-means clustering.

    Every sample receives a degree of membership in each of the ``k``
    clusters instead of a single hard assignment.

    Parameters
    ----------
    k : number of clusters.
    tol : convergence threshold on the L1 shift of every centre between
        two consecutive iterations.
    max_rep : maximum number of iterations performed by one ``fit`` call.
    m : fuzzifier exponent; must be greater than 1.

    Attributes set by ``fit``
    -------------------------
    universe_matrix : (k, n_samples) membership matrix, columns sum to 1.
    v_centres : (k, n_features) cluster centres.
    labels_ : index of the highest-membership cluster for each sample.
    inertia_ : sum of squared Euclidean distances between each sample and
        the centre of the cluster it is labelled with.
    n_iter_ : number of iterations actually run.
    """

    def __init__(self, k=2, tol=0.001, max_rep=100, m=2.0):
        self.k = k
        self.tol = tol
        self.max_rep = max_rep
        # Fuzzy parameter
        self.m = m
        self.name = 'FuzzyCMeans'

    def init_centers(self, data, init_type):
        """Return an initial (k, n_samples) membership matrix.

        Only ``init_type='random'`` is supported: uniform random values
        normalised so that each sample's memberships sum to 1.

        Raises ValueError for any other ``init_type`` (previously ``None``
        was returned silently, which only failed later on).
        """
        if init_type != 'random':
            raise ValueError('Unsupported init_type: %r' % (init_type,))
        values = np.random.rand(self.k, data.shape[0])
        return values / values.sum(axis=0)

    def new_universe_matrix(self, universe_matrix, data):
        """Recompute ``self.universe_matrix`` from the current centres.

        ``universe_matrix`` is accepted for backward compatibility only; the
        result depends solely on ``data`` and ``self.v_centres``.
        """
        power_num = float(2 / (self.m - 1))
        # dists: k rows, row i holds the L1 distance of every sample to centre i
        dists = np.array([np.linalg.norm(data - v_centre, ord=1, axis=1)
                          for v_centre in self.v_centres])

        # Samples lying exactly on a centre would cause 0/0 in the update
        # formula; they are handled separately below.
        at_centre = dists == 0
        coincident = at_centre.any(axis=0)
        safe = np.where(coincident, 1.0, dists)

        # u_ij = d_ij^-p / sum_c d_cj^-p, which equals the textbook
        # 1 / sum_c (d_ij / d_cj)^p. Scaling each column by its smallest
        # distance first keeps the powers within (0, 1].
        relative = safe / safe.min(axis=0)
        weights = relative ** -power_num
        memberships = weights / weights.sum(axis=0)

        if coincident.any():
            # Full membership, shared equally between the centres the
            # sample coincides with, and none for the other centres.
            hits = at_centre[:, coincident]
            memberships[:, coincident] = hits / hits.sum(axis=0)

        self.universe_matrix = memberships

    def v_centres_calc(self, data):
        """Set ``self.v_centres`` to the membership-weighted sample means."""
        universe_matrix_m = self.universe_matrix ** self.m
        v_centres_num = np.matmul(universe_matrix_m, data)
        # keepdims yields a column vector so the division broadcasts per centre
        v_centres_den = universe_matrix_m.sum(axis=1, keepdims=True)
        self.v_centres = v_centres_num / v_centres_den

    def find_mindist(self, data, seed):
        """Return the Euclidean distance of every row of ``data`` to centre ``seed``."""
        return np.linalg.norm(data - self.v_centres[seed], axis=1)

    def fit(self, dt):
        """Cluster ``dt`` (a DataFrame or a 2-D numpy array).

        Each call starts from a fresh random membership matrix and runs at
        most ``self.max_rep`` iterations; ``self.max_rep`` itself is left
        untouched so the estimator can be fitted again.

        Raises TypeError if ``dt`` is neither a DataFrame nor an ndarray.
        """
        if isinstance(dt, pd.DataFrame):
            data = dt.values
        elif isinstance(dt, np.ndarray):
            data = dt
        else:
            raise TypeError('dt should be a DataFrame or a numpy array')

        self.universe_matrix = self.init_centers(data, 'random')
        # Always derive the starting centres from the new memberships so that
        # state left over from a previous fit cannot leak into this one.
        self.v_centres_calc(data)

        remaining = self.max_rep
        converge = False
        while remaining > 0 and not converge:
            v_centres_old = self.v_centres.copy()
            self.new_universe_matrix(self.universe_matrix, data)
            self.v_centres_calc(data)
            # Converged once every centre moved by at most tol (L1 norm).
            shifts = np.linalg.norm(self.v_centres - v_centres_old, ord=1, axis=1)
            converge = bool((shifts <= self.tol).all())
            remaining -= 1

        self.n_iter_ = self.max_rep - remaining
        self.labels_ = self.universe_matrix.argmax(axis=0)
        print('Remaining repetitions: %s' % (remaining))

        self.inertia_ = 0
        for seed in range(len(self.v_centres)):
            members = data[self.labels_ == seed]
            self.inertia_ += (self.find_mindist(members, seed) ** 2).sum()
def distance_metric(a, b, dist='Euclidean'):
    """Return the distance between every row of ``a`` and the vector ``b``.

    ``a`` is a 2-D matrix with one sample per row and ``b`` is a 1-D vector
    (the centroid) with one entry per column of ``a``. Numerical values are
    assumed to be normalised already.

    ``dist`` names the metric; only 'Euclidean' (the default) is
    implemented.

    Raises ValueError if ``dist`` is not supported or if the number of
    columns of ``a`` differs from the length of ``b``. Previously a shape
    mismatch silently returned ``None`` and an unknown metric silently fell
    back to Euclidean.
    """
    if str(dist).lower() != 'euclidean':
        raise ValueError('Unsupported distance metric: %r' % (dist,))
    if a.shape[1] != b.shape[0]:
        raise ValueError(
            'a has %d columns but b has %d entries' % (a.shape[1], b.shape[0])
        )
    squared = ((a - b) ** 2).sum(axis=1)
    return squared ** 0.5
# Disabled usage example: the snippet below sits inside a bare string
# literal, so it is never executed when the module is imported or run.
'''
clf = MyFuzzyCmeans()
data = np.array([[2,3],[3,4],[1,5], [10,9], [12,13], [13,14],[11,15]])
clf.fit(data)
print(clf.universe_matrix)
print(clf.labels_)
'''