machine_replacement.py
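
"""Machine replacement MDP experiment.

A summary of what this script appears to do (inferred from the code below, not
an authoritative description): it draws samples from a hand-specified reward
posterior for a 4-state machine replacement MDP, solves the MDP for the
posterior-mean reward via the LP solver in `mdp`, and then sweeps a range of
lambda values to trace the CVaR / expected-return efficient frontier with
`generate_efficient_frontier.calc_frontier`, saving the resulting plot to
./figs/machine_replacement/.
"""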
import bayesian_irl
import mdp_worlds
import utils
import mdp
import numpy as np
import scipy
import random
import generate_efficient_frontier
import matplotlib.pyplot as plt


def generate_reward_sample():
    # rewards for no-op are (negated) gamma distributed, with a per-state scale
    r_noop = []
    locs = 1 / 2  # gamma shape parameter passed to np.random.gamma(shape, scale, size)
    scales = [20, 40, 80, 190]
    for i in range(4):
        r_noop.append(-np.random.gamma(locs, scales[i], 1)[0])
    r_noop = np.array(r_noop)
    # rewards for repair are drawn from -N(100, 1) for every state
    r_repair = -100 + -1 * np.random.randn(4)
    return np.concatenate((r_noop, r_repair))
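
# A minimal sanity-check sketch (hypothetical usage, not part of the original
# script): each call returns an 8-vector whose first four entries are the
# (negative, gamma-distributed) no-op rewards for states 0..3 and whose last
# four entries are the (roughly -100) repair rewards, i.e.
#   r = generate_reward_sample()   # r.shape == (8,)
# This presumably matches the r_sa layout that mdp.MachineReplacementMDP
# expects, since the posterior mean of these samples is passed to it directly
# in the main block below.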


def generate_posterior_samples(num_samples):
    print("drawing {} posterior samples".format(num_samples))
    all_samples = []
    for i in range(num_samples):
        r_sample = generate_reward_sample()
        all_samples.append(r_sample)
    print("mean of posterior from samples")
    print(np.mean(all_samples, axis=0))
    posterior = np.array(all_samples)
    return posterior.transpose()  # each column is a reward sample
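
# Note: the returned `posterior` matrix is (8 x num_samples); row k holds the
# sampled rewards for the k-th (state, action) entry and each column is one
# complete reward draw, so np.mean(posterior, axis=1) below recovers the
# posterior-mean reward vector used for the nominal MDP.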


if __name__ == "__main__":
    seed = 1234
    np.random.seed(seed)
    # scipy.random.seed(seed)  # scipy.random has been deprecated/removed in recent SciPy
    # releases; SciPy draws from NumPy's global RNG, so np.random.seed above suffices
    random.seed(seed)

    num_states = 4
    num_samples = 2000
    gamma = 0.95
    alpha = 0.99
    lamda = 0.9  # not used directly below; lambda_range sweeps the tradeoff instead

    posterior = generate_posterior_samples(num_samples)
    r_sa = np.mean(posterior, axis=1)

    init_distribution = np.ones(num_states) / num_states  # uniform initial state distribution
    mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)

    print("---MDP solution for expectation---")
    print("mean MDP reward", r_sa)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("mean policy from posterior")
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
    print("MAP/Mean policy from posterior")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("rewards")
    print(mdp_env.r_sa)
    print("expected value = ", np.dot(u_sa, r_sa))
    stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
    print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
    print("values", mdp.get_state_values(u_sa, mdp_env))
    print('q-values', mdp.get_q_values(u_sa, mdp_env))
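
    # Interpretation (a reading of the calls above, not a spec of the `mdp`
    # module): solve_mdp_lp appears to return state-action occupancy
    # frequencies u_sa for the optimal policy, so the expected return printed
    # above is just the inner product np.dot(u_sa, r_sa), and the stochastic
    # policy is recovered from the per-state normalized occupancies
    # (utils.get_optimal_policy_from_usa).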

    # run CVaR optimization; just the robust version for now (u_expert is a
    # zero occupancy vector, i.e., no expert baseline)
    u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
    # print("solving for CVaR optimal policy")
    posterior_probs = np.ones(num_samples) / num_samples  # uniform weights over the posterior samples

    # generate the efficient frontier over a range of lambda values
    lambda_range = [0.0, 0.3, 0.5, 0.75, 0.95, 0.99, 1.0]
    # generate_efficient_frontier.calc_frontier(mdp_env, u_expert, posterior, posterior_probs, lambda_range, alpha, debug=False)
    alpha = 0.99
    print("calculating optimal policy for alpha = {} over lambda = {}".format(alpha, lambda_range))
    cvar_rets = generate_efficient_frontier.calc_frontier(mdp_env, u_expert, posterior, posterior_probs, lambda_range, alpha, debug=False)
    cvar_rets_array = np.array(cvar_rets)
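
    # Interpretation (inferred from the axis labels used in the plot below):
    # each row of cvar_rets_array seems to hold (CVaR of return, expected
    # return) for the policy optimized at the corresponding entry of
    # lambda_range; lambda presumably controls the tradeoff between the
    # alpha-CVaR robust objective and expected return, with the exact
    # convention defined in generate_efficient_frontier.calc_frontier.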

    # plot the frontier and label the points with their lambda values
    plt.figure()
    plt.plot(cvar_rets_array[:, 0], cvar_rets_array[:, 1], '-o')

    # keep only unique points so overlapping solutions get a single lambda label
    unique_pts_lambdas = []
    unique_pts = []
    for i, pt in enumerate(cvar_rets_array):
        unique = True
        for upt in unique_pts:
            if np.linalg.norm(upt - pt) < 0.00001:
                unique = False
                break
        if unique:
            unique_pts_lambdas.append((pt[0], pt[1], lambda_range[i]))
            unique_pts.append(np.array(pt))

    # offsets for nudging the text labels away from the plotted points
    offsetx = (np.max(cvar_rets_array[:, 0]) - np.min(cvar_rets_array[:, 0])) / 30
    offsety = (np.max(cvar_rets_array[:, 1]) - np.min(cvar_rets_array[:, 1])) / 17
    for i, pt in enumerate(unique_pts_lambdas):
        if i in [0, 1, 2, 4]:
            plt.text(pt[0] - 6.2 * offsetx, pt[1], r"$\lambda = {}$".format(str(pt[2])), fontsize=19, fontweight='bold')
        elif i in [3]:
            plt.text(pt[0] - 6.2 * offsetx, pt[1] - 1.2 * offsety, r"$\lambda = {}$".format(str(pt[2])), fontsize=19, fontweight='bold')
        elif i in [5]:
            plt.text(pt[0] - 5.5 * offsetx, pt[1] - 1.5 * offsety, r"$\lambda = {}$".format(str(pt[2])), fontsize=19, fontweight='bold')
        else:
            plt.text(pt[0] - offsetx, pt[1] - 1.5 * offsety, r"$\lambda = {}$".format(str(pt[2])), fontsize=19, fontweight='bold')

    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.xlabel("Robustness (CVaR)", fontsize=20)
    plt.ylabel("Expected Return", fontsize=20)
    plt.tight_layout()
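
    # Note: plt.savefig does not create missing directories, so the
    # ./figs/machine_replacement/ folder must already exist (or be created
    # first, e.g. with os.makedirs('./figs/machine_replacement', exist_ok=True)).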
    plt.savefig('./figs/machine_replacement/efficient_frontier_machine_replacement.png')
    plt.show()