Skip to content

Commit 3aee84b

Browse files
committed
Fixed style issues
1 parent b22532c commit 3aee84b

File tree

5 files changed

+83
-71
lines changed

5 files changed

+83
-71
lines changed

examples/src/main/python/mllib/gaussian_mixture_model.py

+12-14
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,14 @@
1717

1818
"""
1919
A Gaussian Mixture Model clustering program using MLlib.
20-
2120
"""
22-
2321
import sys
2422
import random
2523
import argparse
2624
import numpy as np
2725

2826
from pyspark import SparkConf, SparkContext
29-
from pyspark.mllib.clustering import GaussianMixtureEM
27+
from pyspark.mllib.clustering import GaussianMixture
3028

3129

3230
def parseVector(line):
@@ -37,29 +35,29 @@ def parseVector(line):
3735
"""
3836
Parameters
3937
----------
40-
input_file : Input file path which contains data points
38+
inputFile : Input file path which contains data points
4139
k : Number of mixture components
42-
convergenceTol : convergence_threshold. Default to 1e-3
43-
seed : random seed
40+
convergenceTol : Convergence threshold. Defaults to 1e-3
4441
maxIterations : Number of EM iterations to perform. Defaults to 100
42+
seed : Random seed
4543
"""
4644

4745
parser = argparse.ArgumentParser()
48-
parser.add_argument('input_file', help='input file')
49-
parser.add_argument('k', type=int, help='num_of_clusters')
50-
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence_threshold')
46+
parser.add_argument('inputFile', help='Input File')
47+
parser.add_argument('k', type=int, help='Number of clusters')
48+
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
49+
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
5150
parser.add_argument('--seed', default=random.getrandbits(19),
52-
type=long, help='num_of_iterations')
53-
parser.add_argument('--maxIterations', default=100, type=int, help='max_num_of_iterations')
51+
type=long, help='Random seed')
5452
args = parser.parse_args()
5553

5654
conf = SparkConf().setAppName("GMM")
5755
sc = SparkContext(conf=conf)
5856

59-
lines = sc.textFile(args.input_file)
57+
lines = sc.textFile(args.inputFile)
6058
data = lines.map(parseVector)
61-
model = GaussianMixtureEM.train(data, args.k, args.convergenceTol,
62-
args.seed, args.maxIterations)
59+
model = GaussianMixture.train(data, args.k, args.convergenceTol,
60+
args.maxIterations, args.seed)
6361
for i in range(args.k):
6462
print ("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
6563
"sigma = ", model.gaussians[i].sigma.toArray())

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

+11-13
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ class PythonMLLibAPI extends Serializable {
261261
}
262262

263263
/**
264-
* Java stub for Python mllib KMeans.train()
264+
* Java stub for Python mllib KMeans.run()
265265
*/
266266
def trainKMeansModel(
267267
data: JavaRDD[Vector],
@@ -286,22 +286,23 @@ class PythonMLLibAPI extends Serializable {
286286
}
287287

288288
/**
289-
* Java stub for Python mllib GaussianMixtureEM.train()
289+
* Java stub for Python mllib GaussianMixture.run()
290290
* Returns a list containing weights, mean and covariance of each mixture component.
291291
*/
292-
def trainGaussianMixtureEM(
292+
def trainGaussianMixture(
293293
data: JavaRDD[Vector],
294294
k: Int,
295295
convergenceTol: Double,
296-
seed: Long,
297-
maxIterations: Int): JList[Object] = {
298-
val gmmAlg = new GaussianMixtureEM()
296+
maxIterations: Int,
297+
seed: Long): JList[Object] = {
298+
val gmmAlg = new GaussianMixture()
299299
.setK(k)
300300
.setConvergenceTol(convergenceTol)
301-
.setSeed(seed)
302301
.setMaxIterations(maxIterations)
302+
.setSeed(seed)
303303
try {
304304
val model = gmmAlg.run(data.rdd.persist(StorageLevel.MEMORY_AND_DISK))
305+
305306
var wtArray:Array[Double] = Array()
306307
var muArray:Array[Vector] = Array()
307308
var siArray :Array[Matrix] = Array()
@@ -310,6 +311,7 @@ class PythonMLLibAPI extends Serializable {
310311
muArray = muArray ++ Array(model.gaussians(i).mu)
311312
siArray = siArray ++ Array(model.gaussians(i).sigma)
312313
}
314+
313315
List(wtArray, muArray, siArray).map(_.asInstanceOf[Object]).asJava
314316
} finally {
315317
data.rdd.unpersist(blocking = false)
@@ -319,23 +321,19 @@ class PythonMLLibAPI extends Serializable {
319321
/**
320322
* Java stub for Python mllib GaussianMixtureModel.predictSoft()
321323
*/
322-
def predictGMM(
324+
def predictSoftGMM(
323325
data: JavaRDD[Vector],
324326
wt: Object,
325327
mu: Array[Object],
326328
si: Array[Object]): RDD[Array[Double]] = {
327-
try {
328329
val weight = wt.asInstanceOf[Array[Double]]
329330
val mean = mu.map(_.asInstanceOf[DenseVector])
330331
val sigma = si.map(_.asInstanceOf[DenseMatrix])
331332
val gaussians = Array.tabulate(weight.length){
332333
i => new MultivariateGaussian(mean(i),sigma(i))
333334
}
334335
val model = new GaussianMixtureModel(weight, gaussians)
335-
model.predictSoft(data.rdd.persist(StorageLevel.MEMORY_AND_DISK))
336-
} finally {
337-
data.rdd.unpersist(blocking = false)
338-
}
336+
model.predictSoft(data)
339337
}
340338

341339
/**

python/pyspark/mllib/clustering.py

+50-35
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,20 @@
1717

1818
from numpy import array
1919

20+
from pyspark import RDD
2021
from pyspark import SparkContext
2122
from pyspark.mllib.common import callMLlibFunc, callJavaFunc
2223
from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector
24+
from pyspark.mllib.stat.distribution import MultivariateGaussian
2325

24-
__all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixtureEM',
25-
'MultiVariateGaussian']
26+
__all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture']
2627

2728

2829
class KMeansModel(object):
2930

3031
"""A clustering model derived from the k-means method.
3132
32-
>>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4,2)
33+
>>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
3334
>>> model = KMeans.train(
3435
... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random")
3536
>>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
@@ -94,8 +95,8 @@ class GaussianMixtureModel(object):
9495
9596
>>> clusterdata_1 = sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
9697
... 0.9,0.8,0.75,0.935,
97-
... -0.83,-0.68,-0.91,-0.76 ]).reshape(6,2))
98-
>>> model = GaussianMixtureEM.train(clusterdata_1, 3, convergenceTol=0.0001,
98+
... -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2))
99+
>>> model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.0001,
99100
... maxIterations=50, seed=10)
100101
>>> labels = model.predict(clusterdata_1).collect()
101102
>>> labels[0]==labels[1]
@@ -108,8 +109,8 @@ class GaussianMixtureModel(object):
108109
... -5.2211, -5.0602, 4.7118,
109110
... 6.8989, 3.4592, 4.6322,
110111
... 5.7048, 4.6567, 5.5026,
111-
... 4.5605, 5.2043, 6.2734]).reshape(5,3))
112-
>>> model = GaussianMixtureEM.train(clusterdata_2, 2, convergenceTol=0.0001,
112+
... 4.5605, 5.2043, 6.2734]).reshape(5, 3))
113+
>>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
113114
... maxIterations=150, seed=10)
114115
>>> labels = model.predict(clusterdata_2).collect()
115116
>>> labels[0]==labels[1]==labels[2]
@@ -123,46 +124,60 @@ def __init__(self, weights, gaussians):
123124
self.gaussians = gaussians
124125
self.k = len(self.weights)
125126

126-
def predict(self, X):
127+
def predict(self, x):
127128
"""
128-
Find the cluster to which the points in X has maximum membership
129+
Find the cluster to which the points in 'x' have maximum membership
129130
in this model.
130-
Returns an RDD of cluster labels.
131-
"""
132-
cluster_labels = self.predictSoft(X).map(lambda x: x.index(max(x)))
133-
return cluster_labels
134131
135-
def predictSoft(self, X):
136-
"""
137-
Find the membership of each point in X to all mixture components.
138-
Returns an RDD of array of double values.
139-
"""
140-
means_temp = ()
141-
sigmas_temp = ()
142-
for i in range(self.k):
143-
means_temp = means_temp + (self.gaussians[i].mu,)
144-
sigmas_temp = sigmas_temp + (self.gaussians[i].sigma,)
145-
membership_matrix = callMLlibFunc("predictGMM", X.map(_convert_to_vector),
146-
self.weights, means_temp, sigmas_temp)
147-
return membership_matrix
132+
Parameters
133+
----------
134+
x : RDD of data points
148135
136+
Returns
137+
-------
138+
cluster_labels : RDD of cluster labels.
139+
"""
140+
if isinstance(x, RDD):
141+
cluster_labels = self.predictSoft(x).map(lambda z: z.index(max(z)))
142+
return cluster_labels
149143

150-
class MultiVariateGaussian(object):
144+
def predictSoft(self, x):
145+
"""
146+
Find the membership of each point in 'x' to all mixture components.
151147
152-
def __init__(self, mu, sigma):
153-
self.mu = mu
154-
self.sigma = sigma
148+
Parameters
149+
----------
150+
x : RDD of data points
155151
152+
Returns
153+
-------
154+
membership_matrix : RDD of array of double values.
155+
"""
156+
means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
157+
membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
158+
self.weights, means, sigmas)
159+
return membership_matrix
156160

157-
class GaussianMixtureEM(object):
158161

162+
class GaussianMixture(object):
163+
"""
164+
Estimate model parameters with the expectation-maximization algorithm.
165+
166+
Parameters
167+
----------
168+
data - RDD of data points
169+
k - Number of components
170+
convergenceTol - Threshold value to check the convergence criteria. Defaults to 1e-3
171+
maxIterations - Number of iterations. Defaults to 100
172+
seed - Random Seed
173+
"""
159174
@classmethod
160-
def train(cls, rdd, k, convergenceTol=1e-3, seed=None, maxIterations=100):
175+
def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None):
161176
"""Train a Gaussian Mixture clustering model."""
162-
weight, mu, sigma = callMLlibFunc("trainGaussianMixtureEM",
177+
weight, mu, sigma = callMLlibFunc("trainGaussianMixture",
163178
rdd.map(_convert_to_vector), k,
164-
convergenceTol, seed, maxIterations)
165-
mvg_obj = array([MultiVariateGaussian(mu[i], sigma[i]) for i in range(k)])
179+
convergenceTol, maxIterations, seed)
180+
mvg_obj = array([MultivariateGaussian(mu[i], sigma[i]) for i in range(k)])
166181
return GaussianMixtureModel(weight, mvg_obj)
167182

168183

python/pyspark/mllib/stat/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@
2020
"""
2121

2222
from pyspark.mllib.stat._statistics import *
23+
from pyspark.mllib.stat.distribution import MultivariateGaussian
2324

24-
__all__ = ["Statistics", "MultivariateStatisticalSummary"]
25+
__all__ = ["Statistics", "MultivariateStatisticalSummary", "MultivariateGaussian"]

python/pyspark/mllib/tests.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -168,28 +168,28 @@ def test_kmeans_deterministic(self):
168168
self.assertTrue(array_equal(c1, c2))
169169

170170
def test_gmm(self):
171-
from pyspark.mllib.clustering import GaussianMixtureEM
171+
from pyspark.mllib.clustering import GaussianMixture
172172
data = self.sc.parallelize([
173173
[1, 2],
174174
[8, 9],
175175
[-4, -3],
176176
[-6, -7],
177177
])
178-
clusters = GaussianMixtureEM.train(data, 2, convergenceTol=0.001,
179-
seed=56, maxIterations=100)
178+
clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
179+
maxIterations=100, seed=56)
180180
labels = clusters.predict(data).collect()
181181
self.assertEquals(labels[0], labels[1])
182182
self.assertEquals(labels[2], labels[3])
183183

184184
def test_gmm_deterministic(self):
185-
from pyspark.mllib.clustering import GaussianMixtureEM
185+
from pyspark.mllib.clustering import GaussianMixture
186186
X = range(0, 100, 10)
187187
Y = range(0, 100, 10)
188188
data = self.sc.parallelize([[x, y] for x, y in zip(X, Y)])
189-
clusters1 = GaussianMixtureEM.train(data, 5, convergenceTol=0.001,
190-
seed=63, maxIterations=100)
191-
clusters2 = GaussianMixtureEM.train(data, 5, convergenceTol=0.001,
192-
seed=63, maxIterations=100)
189+
clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
190+
maxIterations=100, seed=63)
191+
clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
192+
maxIterations=100, seed=63)
193193
for c1, c2 in zip(clusters1.weights, clusters2.weights):
194194
self.assertEquals(round(c1, 7), round(c2, 7))
195195

0 commit comments

Comments
 (0)