-
Notifications
You must be signed in to change notification settings - Fork 1
/
k_means.ml
182 lines (160 loc) · 6.19 KB
/
k_means.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
(* We require elements to have the structure of a
metric space and to support the computing of geometric means. *)
module type Element = sig
include Intf.Metric
val mean : t array -> t
end
type init = Forgy | RandomPartition | KmeansPP
(* K-means++ *)
type termination = Num_iter of int | Threshold of float | Min of constraints
and constraints = { max_iter : int; threshold : float }
exception KmeansError of string
module Make (E : Element) = struct
type elt = E.t
(* [closest elements centroid elti] returns the pair (m,d) such that
[centroids.(m)] is closest to [elements.(elti)], and the distance
is equal to d. *)
let closest elements centroids elti =
let m = ref 0 in
let d = ref max_float in
for i = 0 to Array.length centroids - 1 do
let dist = E.dist elements.(elti) centroids.(i) in
if dist < !d then (
d := dist ;
m := i)
done ;
(!m, !d)
(* [compute_classes centroids elements] computes for each centroid
the set of elements which are closest to it, i.e. it computes the
voronoi partition of [elements] according to [centroids]. *)
let compute_classes centroids elements =
let classes : int list array = Array.make (Array.length centroids) [] in
Array.iteri
(fun elti _ ->
let (k, _) = closest elements centroids elti in
classes.(k) <- elti :: classes.(k))
elements ;
let classes =
Array.to_list classes
|> List.filter (function [] -> false | _ -> true)
|> Array.of_list
in
Array.map Array.of_list classes
(* [compute_centroids], given a partition [classes] of [elements], returns
the centroid of each class. *)
let compute_centroids (elements : elt array) (classes : int array array) =
Array.map
(fun arr -> E.mean (Array.map (fun i -> elements.(i)) arr))
classes
(* Voronoi iteration: partition according to the centroids, then update the centroids,
etc under the centroids do not move collectively more than [threshold]. *)
let rec iterate (centroids : E.t array) (elements : E.t array) niter
(termination : termination) =
let classes = compute_classes centroids elements in
let centroids' = compute_centroids elements classes in
let terminate =
match termination with
| Num_iter max_iter -> niter >= max_iter
| Threshold threshold ->
let dist =
Array.mapi (fun i c -> E.dist c centroids'.(i)) centroids'
|> Array.fold_left ( +. ) 0.0
in
dist < threshold
| Min { max_iter; threshold } ->
niter >= max_iter
||
let dist =
Array.mapi (fun i c -> E.dist c centroids'.(i)) centroids'
|> Array.fold_left ( +. ) 0.0
in
dist < threshold
in
if terminate then classes
else iterate centroids elements (niter + 1) termination
(* [random_partition_init] picks [k] initial centroids by selecting a
partition in [k] classes at random, and then computing the centroid
of each class. *)
let random_partition_init k elements rng_state =
let classes = Array.make k [] in
Array.iteri
(fun elti _ ->
let i = Random.State.int rng_state k in
classes.(i) <- elti :: classes.(i))
elements ;
let classes = Array.map Array.of_list classes in
compute_centroids elements classes
let pick_uniformly arr rng_state =
let c = Array.length arr in
if c = 0 then
raise
(KmeansError "pick_uniformly: empty array - bug found, please report")
else arr.(Random.State.int rng_state c)
(* Given a discrete probability distribution stored in [arr], pick an index according
to that distribution. *)
(* Note that the distance to a point to itself is 0, so the probability for a centroid
to pick itself is also zero. *)
let pick_proportional arr =
let total = Helpers.array_fsum arr in
let r = Random.float total in
let rec loop i acc =
if acc <= arr.(i) then i else loop (i + 1) (acc -. arr.(i))
in
loop 0 r
(* [kmeanspp_iter] selects [k] centroids iteratively: the first one is taken uniformly at
random, and in the inductive step the next one is picked with a probability proportional
to its squared distance to the closest centroid. *)
let rec kmeanspp_iter k centroids elements =
if k = 0 then centroids
else
let dists =
Array.mapi
(fun elti _ ->
let (_, d) = closest elements centroids elti in
d *. d)
elements
in
let i = pick_proportional dists in
let centroids = Array.concat [centroids; [| elements.(i) |]] in
kmeanspp_iter (k - 1) centroids elements
let kmeanspp_init k elements rng_state =
if k < 1 then raise (KmeansError "kmeanspp_init: k < 1, error")
else
let elt = pick_uniformly elements rng_state in
kmeanspp_iter (k - 1) [| elt |] elements
let k_means_internal ~k ~(init : init) ~elements ~(termination : termination)
rng_state =
let centroids =
match init with
| Forgy -> Helpers.forgy_init k elements rng_state
| RandomPartition -> random_partition_init k elements rng_state
| KmeansPP -> kmeanspp_init k elements rng_state
in
iterate centroids elements 0 termination
let k_means ~k ~init ~elements ~termination rng_state =
if Array.length elements = 0 then
raise (KmeansError "k_means: empty elements array")
else
let classes =
k_means_internal ~k ~init ~elements ~termination rng_state
in
Array.map (Array.map (fun i -> elements.(i))) classes
(* 2 x cluster_radius overapproximates the diameter, hopefully tightly *)
(* let cluster_radius elements =
* let mean = E.mean elements in
* Array.fold_left (fun maxdist elt ->
* max maxdist (E.dist elt mean)
* ) (~-. max_float) elements *)
(* let sum_of_cluster_radius classes =
* Helpers.array_fsum (Array.map cluster_radius classes) *)
let total_squared_dist_to_mean elements =
let mean = E.mean elements in
Array.fold_left
(fun acc elt ->
let d = E.dist elt mean in
acc +. (d *. d))
0.0
elements
let cost ~classes =
Helpers.array_fsum (Array.map total_squared_dist_to_mean classes)
end