-
Notifications
You must be signed in to change notification settings - Fork 0
/
synthetic_data_experiment_2.py
204 lines (161 loc) · 6.39 KB
/
synthetic_data_experiment_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 22:55:36 2020
@author: dama-f
"""
##############################################################################
#  This experiment learns a PHMC-LAR model on unreliably labelled data.
#  The unreliability level is controlled by the parameter rho.
#  rho is the mean of a Beta law that draws the probability for a label to be
#  wrong.
#  This is a supervised learning problem.
#
#  This script takes three input parameters:
#   * train_data_file  This is a pickle serialization file that contains the
#     training data.
#   * output_dir  The name of the directory in which the trained model has to
#     be saved.
#   * rho  The level of unreliability of the labels, given in percentage.
#
##############################################################################
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), \
"src")))
import time
import pickle
import numpy as np
from scipy.stats import beta, bernoulli
from supervised_learning import learning
import warnings
warnings.filterwarnings("ignore")
## @fn
#  @brief Sample non-reliable states from the given data set of N sequences.
#  For each label, the true state is replaced by a wrong one with a
#  probability drawn from a Beta law whose mean is rho/100 and whose standard
#  deviation is sigma. Each wrong state is uniformly chosen among the states
#  of range(nb_regimes) except the true one.
#
#  @param data_S List of N state sequences, each one a 1xT array. T may
#  differ from one sequence to another.
#  @param nb_regimes The number of classes within data.
#  @param rho Unreliability level expressed in percentage.
#  This is the mean of the Beta distribution.
#  @param sigma Unreliability level standard deviation.
#  This is the standard deviation of the Beta distribution.
#
#  @warning The Beta law only exists when
#  (rho/100) * (1 - rho/100) > sigma^2. With sigma = 0.2 this excludes
#  rho below roughly 5 and above roughly 95.
#
#  @throws ValueError when rho and sigma do not define a valid Beta law, or
#  (raised by np.random.choice) when a label must be corrupted but no
#  alternative state exists.
#
#  @return List of N 1xT int32 arrays. When rho == 0, data_S itself is
#  returned untouched.
#
def create_unreliable_labels(data_S, nb_regimes, rho, sigma):

    #----no labelling error
    if rho == 0:
        return data_S

    #----labelling error
    #parameters of the Beta distribution whose mean and variance equal
    #rho/100 and sigma^2 (method of moments)
    mean = rho / 100.0
    variance = sigma**2
    if variance == 0:
        raise ValueError("create_unreliable_labels: sigma must be non-zero")

    a = mean * (mean - mean**2 - variance) / variance
    b = (1 - mean) * (mean - mean**2 - variance) / variance
    #written as a negated conjunction so that NaN parameters are rejected too
    if not (a > 0 and b > 0):
        raise ValueError(
            "create_unreliable_labels: rho = {} % and sigma = {} do not "
            "define a valid Beta law, (rho/100) * (1 - rho/100) > sigma^2 "
            "is required".format(rho, sigma))

    #output
    res = []

    for sequence in data_S:
        true_states = np.asarray(sequence)[0, :]
        #each sequence keeps its own length
        seq_len = true_states.shape[0]

        #one error probability per label, then one Bernoulli trial per label
        error_probs = beta.rvs(a, b, size=seq_len)
        is_wrong = np.random.random_sample(seq_len) < error_probs

        #astype returns a copy, so the input labels are never modified
        noisy_states = true_states.astype(np.int32)

        #only the corrupted positions need an explicit draw
        for t in np.flatnonzero(is_wrong):
            wrong_states = [s for s in range(nb_regimes)
                            if s != true_states[t]]
            #true label is replaced by a random label
            noisy_states[t] = np.random.choice(wrong_states)

        res.append(noisy_states.reshape(1, seq_len))

    return res
#=====================================Begin script
# Flat driver script: parses the command line, corrupts the training labels
# with create_unreliable_labels, trains the model, pickles the result into
# output_dir and prints the learnt parameters.

#starting time
start_time = time.time()

#----three command line arguments are required
if len(sys.argv) != 4:
    print("ERROR: script synthetic_data_experiment_2.py takes 3 arguments !")
    print("Usage: ./synthetic_data_experiment_2.py train_data_file output_dir rho")
    sys.exit(1)

#----unreliability level of labels, given in percentage
#mean of beta law, parameter given in command line
try:
    rho = float(sys.argv[3])
except ValueError:
    print("ERROR: script synthetic_data_experiment_2.py: 0 <= rho < 100 !")
    sys.exit(1)

#written as a negated range test so that NaN is rejected as well
if not (0 <= rho < 100):
    print("ERROR: script synthetic_data_experiment_2.py: 0 <= rho < 100 !")
    sys.exit(1)

#standard deviation of beta law, fixed.
#This value has been used by Ramesso and Denoeux
sigma = 0.2

#----hyper-parameters setting
X_order = 2
nb_regimes = 4
innovation = "gaussian"

#----training data loading
#NOTE: pickle.load executes arbitrary code found in the file; only use
#training files coming from a trusted source.
with open(sys.argv[1], 'rb') as infile:
    data_set = pickle.load(infile)

#time series and associated initial values
data_X = data_set[1]
initial_values = data_set[2]
#create unreliable labels
data_S = create_unreliable_labels(data_set[0], nb_regimes, rho, sigma)

#----LOG-INFO
print("**********************************************************************")
print("S = {}, T = {}".format(len(data_S), data_S[0].shape[1]))
print("X_order = {}, nb_regimes = {}, innovation = {}".format(X_order, \
      nb_regimes, innovation))
print("train_data_file = {}, output_dir = {}, rho = {}".format(sys.argv[1], \
      sys.argv[2], sys.argv[3]))

output_file = os.path.join(sys.argv[2], os.path.basename(sys.argv[1])) + \
              "_unreliable-label_rho=" + str(rho)
#make sure the output directory exists before the (long) training starts, so
#that the result is not lost on a missing directory
os.makedirs(sys.argv[2], exist_ok=True)

#
print("***********************Labelling errors*******************************")
#number of labels that differ from the ground truth, over all sequences
nb_label_errors = 0
#total number of labels; sequences are not assumed to share the same length
nb_labels = 0
for true_seq, noisy_seq in zip(data_set[0], data_S):
    nb_label_errors = nb_label_errors + \
                      np.sum(true_seq[0, :] != noisy_seq[0, :])
    nb_labels = nb_labels + noisy_seq.shape[1]
print("Labelling error rate = {}".format(nb_label_errors / nb_labels))

#----learning: without initialization procedure
#Alternative: learning(X_order, nb_regimes, data_X, initial_values, data_S,
#innovation) from supervised_learning.
#Equivalently, we can use EM_learning, which will converge in one iteration.
from EM_learning import hmc_lar_parameter_learning
model_output = hmc_lar_parameter_learning (X_order, nb_regimes, data_X, \
                                           initial_values, data_S, innovation)

#----save model_output
with open(output_file, 'wb') as outfile:
    pickle.dump(model_output, outfile)
#reported only once the file has actually been written
print("Trained model has been saved within file {}".format(output_file))

#----running time estimation ends
duration = time.time() - start_time
print("======================================================================")
print("#learningTime: algorithm lastes {} minutes".format(duration/60))
print("======================================================================")

#----learnt parameters
print("---------------------psi-------------------------")
print(model_output[-1])
print("------------------AR process----------------------")
print("#total_log_ll= ", model_output[0])
print("ar_coefficients=", model_output[5])
print("intercept=", model_output[7])
print("sigma=", model_output[6])
print()
print("------------------Markov chain----------------------")
print("Pi=", model_output[2])
print("A=", model_output[1])