17
17
18
18
from numpy import array
19
19
20
+ from pyspark import RDD
20
21
from pyspark import SparkContext
21
22
from pyspark .mllib .common import callMLlibFunc , callJavaFunc
22
23
from pyspark .mllib .linalg import DenseVector , SparseVector , _convert_to_vector
24
+ from pyspark .mllib .stat .distribution import MultivariateGaussian
23
25
24
- __all__ = ['KMeansModel' , 'KMeans' , 'GaussianMixtureModel' , 'GaussianMixtureEM' ,
25
- 'MultiVariateGaussian' ]
26
+ __all__ = ['KMeansModel' , 'KMeans' , 'GaussianMixtureModel' , 'GaussianMixture' ]
26
27
27
28
28
29
class KMeansModel (object ):
29
30
30
31
"""A clustering model derived from the k-means method.
31
32
32
- >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4,2)
33
+ >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
33
34
>>> model = KMeans.train(
34
35
... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random")
35
36
>>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
@@ -94,8 +95,8 @@ class GaussianMixtureModel(object):
94
95
95
96
>>> clusterdata_1 = sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
96
97
... 0.9,0.8,0.75,0.935,
97
- ... -0.83,-0.68,-0.91,-0.76 ]).reshape(6,2))
98
- >>> model = GaussianMixtureEM .train(clusterdata_1, 3, convergenceTol=0.0001,
98
+ ... -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2))
99
+ >>> model = GaussianMixture .train(clusterdata_1, 3, convergenceTol=0.0001,
99
100
... maxIterations=50, seed=10)
100
101
>>> labels = model.predict(clusterdata_1).collect()
101
102
>>> labels[0]==labels[1]
@@ -108,8 +109,8 @@ class GaussianMixtureModel(object):
108
109
... -5.2211, -5.0602, 4.7118,
109
110
... 6.8989, 3.4592, 4.6322,
110
111
... 5.7048, 4.6567, 5.5026,
111
- ... 4.5605, 5.2043, 6.2734]).reshape(5,3))
112
- >>> model = GaussianMixtureEM .train(clusterdata_2, 2, convergenceTol=0.0001,
112
+ ... 4.5605, 5.2043, 6.2734]).reshape(5, 3))
113
+ >>> model = GaussianMixture .train(clusterdata_2, 2, convergenceTol=0.0001,
113
114
... maxIterations=150, seed=10)
114
115
>>> labels = model.predict(clusterdata_2).collect()
115
116
>>> labels[0]==labels[1]==labels[2]
@@ -123,46 +124,60 @@ def __init__(self, weights, gaussians):
123
124
self .gaussians = gaussians
124
125
self .k = len (self .weights )
125
126
126
- def predict (self , X ):
127
+ def predict (self , x ):
127
128
"""
128
- Find the cluster to which the points in X has maximum membership
129
+ Find the cluster to which the points in 'x' have maximum membership
129
130
in this model.
130
- Returns an RDD of cluster labels.
131
- """
132
- cluster_labels = self .predictSoft (X ).map (lambda x : x .index (max (x )))
133
- return cluster_labels
134
131
135
- def predictSoft (self , X ):
136
- """
137
- Find the membership of each point in X to all mixture components.
138
- Returns an RDD of array of double values.
139
- """
140
- means_temp = ()
141
- sigmas_temp = ()
142
- for i in range (self .k ):
143
- means_temp = means_temp + (self .gaussians [i ].mu ,)
144
- sigmas_temp = sigmas_temp + (self .gaussians [i ].sigma ,)
145
- membership_matrix = callMLlibFunc ("predictGMM" , X .map (_convert_to_vector ),
146
- self .weights , means_temp , sigmas_temp )
147
- return membership_matrix
132
+ Parameters
133
+ ----------
134
+ x : RDD of data points
148
135
136
+ Returns
137
+ -------
138
+ cluster_labels : RDD of cluster labels.
139
+ """
140
+ if isinstance (x , RDD ):
141
+ cluster_labels = self .predictSoft (x ).map (lambda z : z .index (max (z )))
142
+ return cluster_labels
149
143
150
- class MultiVariateGaussian (object ):
144
+ def predictSoft (self , x ):
145
+ """
146
+ Find the membership of each point in 'x' to all mixture components.
151
147
152
- def __init__ ( self , mu , sigma ):
153
- self . mu = mu
154
- self . sigma = sigma
148
+ Parameters
149
+ ----------
150
+ x : RDD of data points
155
151
152
+ Returns
153
+ -------
154
+ membership_matrix : RDD of array of double values.
155
+ """
156
+ means , sigmas = zip (* [(g .mu , g .sigma ) for g in self .gaussians ])
157
+ membership_matrix = callMLlibFunc ("predictSoftGMM" , x .map (_convert_to_vector ),
158
+ self .weights , means , sigmas )
159
+ return membership_matrix
156
160
157
- class GaussianMixtureEM (object ):
158
161
162
+ class GaussianMixture (object ):
163
+ """
164
+ Estimate model parameters with the expectation-maximization algorithm.
165
+
166
+ Parameters
167
+ ----------
168
+ rdd - RDD of data points
169
+ k - Number of components
170
+ convergenceTol - Threshold value to check the convergence criteria. Defaults to 1e-3
171
+ maxIterations - Number of iterations. Defaults to 100
172
+ seed - Random Seed
173
+ """
159
174
@classmethod
160
- def train (cls , rdd , k , convergenceTol = 1e-3 , seed = None , maxIterations = 100 ):
175
+ def train (cls , rdd , k , convergenceTol = 1e-3 , maxIterations = 100 , seed = None ):
161
176
"""Train a Gaussian Mixture clustering model."""
162
- weight , mu , sigma = callMLlibFunc ("trainGaussianMixtureEM " ,
177
+ weight , mu , sigma = callMLlibFunc ("trainGaussianMixture " ,
163
178
rdd .map (_convert_to_vector ), k ,
164
- convergenceTol , seed , maxIterations )
165
- mvg_obj = array ([MultiVariateGaussian (mu [i ], sigma [i ]) for i in range (k )])
179
+ convergenceTol , maxIterations , seed )
180
+ mvg_obj = array ([MultivariateGaussian (mu [i ], sigma [i ]) for i in range (k )])
166
181
return GaussianMixtureModel (weight , mvg_obj )
167
182
168
183
0 commit comments