-
Notifications
You must be signed in to change notification settings - Fork 0
/
EM.r
187 lines (138 loc) · 5.54 KB
/
EM.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# EM algorithm inspired by the article:
# Dawid and Skene, "Maximum likelihood estimation of
# observer error-rates using the EM algorithm" (1979)
# http://www.jstor.org/stable/10.2307/2346806
# Input: data = data frame with
# - column worker: the index of the worker.
# - column item: the index of the item (index >= 1).
# - column label: the index of the label (index >= 1).
# Note: all indices should start at 1 or above.
# Example of input:
# worker, item, label
# 1 1 1
# 2 3 2
# 1 4 1
# 1 5 3
# result=EM_classification(data,nb_iterations=10)
#
# Run the EM procedure on a table of crowd-sourced labels.
#
# Arguments:
#   data          - data frame with the columns worker, item and label.
#                   A label equal to 0 is remapped to the number of labels.
#   nb_iterations - number of EM iterations (default 10). With 0 the initial
#                   estimates are returned unchanged.
#
# Returns a list with:
#   out           - logical vector, TRUE where the probability of category 1
#                   is above 0.5
#   items_pb      - per-item category probabilities
#   prior         - estimated prior probability of each category
#   error_rates   - data frame (worker, from, to, error)
#   nb_iterations - the requested number of iterations
EM_classification <- function(data, nb_iterations = 10) {
  nb_labels <- length(unique(data$label))
  # in case the first category starts at 0
  data$label[data$label == 0] <- nb_labels
  categories <- seq_len(nb_labels)

  # item probability matrix [nb_items x nb_categories]:
  # row = item, column = probability that the item belongs to that category
  items_pb <- init_items(data)
  # prior probability of each category
  prior <- init_prior(data)
  # error rates table, one row per (worker, from, to):
  # 0.9 when from == to, 0.1 / (|categories| - 1) otherwise
  error_rates <- init_error_rates(data)
  working <- init_working(data)

  # seq_len() so that nb_iterations = 0 performs no iteration
  # (1:0 would run the loop twice)
  for (i in seq_len(nb_iterations)) {
    working <- fill_working(working, error_rates, categories)
    # estimate the class probabilities of each item; estimate_items is the
    # estimator defined in this file (the previously called estimate_item2
    # is not defined here)
    items_pb <- estimate_items(working, prior)
    # estimate the prior
    prior <- estimate_prior(items_pb)
    # estimate the error rates of each worker
    for (worker in unique(error_rates$worker)) {
      error_rates <- estimate_individual_error_rates(
        worker, categories, working, prior, error_rates, data
      )
    }
  }

  # binary decision on category 1 only: to be removed or replaced when
  # there are more than two categories
  result <- items_pb[, 1] > 0.5
  list(
    out = result,
    items_pb = items_pb,
    prior = prior,
    error_rates = error_rates,
    nb_iterations = nb_iterations
  )
}
# Build the initial item probability matrix.
# One row per distinct item, one column per distinct label; every cell
# starts at the uniform probability 1 / (number of labels).
#   input - data frame with the columns item and label
init_items <- function(input) {
  n_categories <- length(unique(input$label))
  n_objects <- length(unique(input$item))
  matrix(1 / n_categories, nrow = n_objects, ncol = n_categories)
}
# Build the initial category prior: a uniform distribution over the
# distinct labels found in input$label.
#   input - data frame with a label column
init_prior <- function(input) {
  n_categories <- length(unique(input$label))
  uniform_mass <- 1 / n_categories
  rep(uniform_mass, times = n_categories)
}
# Build the initial error rates table.
# One row per (worker, from, to) triple, with columns worker, from, to, error.
# For each worker the rows run through every `from` category and, inside it,
# every `to` category.
# error starts at 0.9 when from == to and at 0.1 / (number of labels - 1)
# otherwise.
#   input - data frame with the columns worker and label
init_error_rates <- function(input) {
  worker_ids <- unique(input$worker)
  n_workers <- length(worker_ids)
  n_categories <- length(unique(input$label))
  category_idx <- 1:n_categories

  # lay out the full worker x from x to grid
  rates <- data.frame(
    worker = rep(worker_ids, each = n_categories * n_categories),
    from = rep(category_idx, each = n_categories, times = n_workers),
    to = rep(category_idx, times = n_workers * n_categories),
    error = 0
  )

  # default confusion values
  on_diagonal <- rates$to == rates$from
  rates$error[on_diagonal] <- 0.9
  rates$error[!on_diagonal] <- 0.1 / (n_categories - 1)
  rates
}
# Build the temporary working table from the raw labels.
# Every row of `data` is repeated once per candidate true label, and two
# columns are appended:
#   true_label - the candidate true label (1..number of labels) of that copy
#   evidence   - initialised to 1; overwritten later with the error rates
#   data - data frame with a label column (plus worker and item, which are
#          copied through unchanged)
# A data frame with zero rows yields a working table with zero rows.
init_working <- function(data) {
  nb_labels <- length(unique(data$label))
  # seq_len() rather than 1:n, so that an empty input expands to no rows
  expanded <- data[rep(seq_len(nrow(data)), each = nb_labels), ]
  # the candidate labels cycle once per original row
  expanded$true_label <- rep(seq_len(nb_labels), times = nrow(data))
  # explicit length so the assignment is also valid on a zero-row table
  expanded$evidence <- rep(1, nrow(expanded))
  expanded
}
# Refresh the evidence column of the working table from the current
# error rates.
# For each worker, given label and candidate true label, the matching rows
# of `working` receive error_rates$error of the row
# (worker, from = true label, to = given label).
#   working     - data frame with columns worker, label, true_label, evidence
#   error_rates - data frame with columns worker, from, to, error
#   categories  - vector of category indices to go through
# Returns the updated working table.
fill_working <- function(working, error_rates, categories) {
  for (w in unique(working$worker)) {
    rows_w <- working$worker == w
    rates_w <- error_rates$worker == w
    for (given in categories) {
      rows_w_given <- rows_w & working$label == given
      rates_w_given <- rates_w & error_rates$to == given
      for (truth in categories) {
        rate <- error_rates$error[rates_w_given & error_rates$from == truth]
        working$evidence[rows_w_given & working$true_label == truth] <- rate
      }
    }
  }
  working
}
# Estimate the category prior as the per-column total of the item
# probability table divided by its number of rows.
#   items_pb - item probability table (rows = items, columns = categories)
estimate_prior <- function(items_pb) {
  n_items <- nrow(items_pb)
  category_mass <- colSums(items_pb)
  category_mass / n_items
}
# Equation 2.5 of Dawid and Skene (vectorized): class probabilities of
# each item.
#   working - data frame with the columns item, true_label and evidence
#   prior   - prior probability of each category, in the order of the
#             sorted true_label values
# Returns a data frame with one row per item (row names = item ids, rows in
# sorted item order) and one column per true_label value; each row is
# normalised to sum to 1.
# Only base R is used (tapply/sweep), so no reshaping package is required.
estimate_items <- function(working, prior) {
  # product of the evidence of all the workers, for every item and every
  # candidate true label
  likelihood <- tapply(
    working$evidence,
    list(working$item, working$true_label),
    prod
  )
  # one prior per category column, otherwise the weighting would be misaligned
  stopifnot(ncol(likelihood) == length(prior))
  # we add the prior: column i is weighted by prior[i]
  joint <- sweep(likelihood, 2, prior, "*")
  # we normalize each item over the categories
  posterior <- joint / rowSums(joint)
  as.data.frame(posterior)
}
# Estimate the error rates of one worker.
#   worker      - id of the worker to update
#   categories  - vector of category indices
#   working     - working table (columns item, worker, true_label, evidence)
#   prior       - prior probability of each category
#   error_rates - data frame (worker, from, to, error) to update
#   data        - raw labels (columns worker, item, label)
# Returns error_rates with the rows of `worker` re-estimated. Rows are left
# untouched when there is nothing to estimate them from (no other worker
# labelled any of this worker's items, or no probability mass for a given
# `from` category), instead of being replaced by NaN.
estimate_individual_error_rates <- function(worker, categories, working, prior,
                                            error_rates, data) {
  by_worker <- data$worker == worker
  worker_items <- data$item[by_worker]
  worker_labels <- data$label[by_worker]

  # for the items labelled by the worker, we recompute locally the item
  # probabilities without the influence of the worker, and use them as
  # gold standard
  others <- working[working$item %in% worker_items & working$worker != worker, ]
  if (nrow(others) == 0) {
    # nobody else labelled these items: keep the current rates
    return(error_rates)
  }
  items_worker <- estimate_items(others, prior)

  # estimate_items is expected to return one row per item in sorted item
  # order; the label given by the worker is looked up per item id rather
  # than taken by position, so unsorted data or items without any other
  # annotator cannot misalign the two
  scored_items <- sort(unique(others$item))
  stopifnot(nrow(items_worker) == length(scored_items))
  given <- worker_labels[match(scored_items, worker_items)]

  worker_rates <- error_rates$worker == worker
  for (from in categories) {
    # probability mass of true category `from`, split by the given label
    mass <- vapply(
      categories,
      function(to) sum(items_worker[given == to, from]),
      numeric(1)
    )
    total <- sum(mass)
    if (!is.finite(total) || total <= 0) {
      # no usable mass for this category: keep the previous rates
      next
    }
    # we update the error rates, normalized over the `to` categories
    for (k in seq_along(categories)) {
      cell <- worker_rates & error_rates$from == from &
        error_rates$to == categories[k]
      error_rates$error[cell] <- mass[k] / total
    }
  }
  error_rates
}