Commit

aggregate cage-2 mdp
Limmen committed Jan 11, 2025
1 parent 325f4d9 commit 68531fb
Showing 7 changed files with 296 additions and 868 deletions.
359 changes: 191 additions & 168 deletions examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py

Large diffs are not rendered by default.

136 changes: 86 additions & 50 deletions examples/eval/cyborg_scenario_two/eval_aggregate_mdp_on_wrapper.py
@@ -1,7 +1,6 @@
from typing import List, Dict, Tuple
import random
import numpy as np
import copy
import math
from collections import Counter
from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
@@ -12,24 +11,36 @@
import csle_agents.constants.constants as agents_constants


def monte_carlo_most_frequent_particle(particles, N):
def monte_carlo_most_frequent_particle(particles: List[CyborgWrapperState], N: int) -> CyborgWrapperState:
"""
Samples N particles and returns the most frequently sampled particle
:param particles: the list of particles
:param N: the number of samples
:return: the most frequently sampled particle
"""
samples = [random.choice(particles) for _ in range(N)]
counter = Counter(samples)
most_frequent_particle = counter.most_common(1)[0][0]
return most_frequent_particle
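
A minimal usage sketch (not part of the commit): the toy string particles below stand in for the CyborgWrapperState objects used in the evaluation loop further down; the helper simply returns the mode of N random draws.

# Illustrative only: estimate the most likely particle from a toy belief.
toy_particles = ["state_a", "state_a", "state_b"]
print(monte_carlo_most_frequent_particle(particles=toy_particles, N=1000))  # most likely "state_a"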


def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
def particle_filter(particles: List[CyborgWrapperState], max_num_particles: int, train_env: CyborgScenarioTwoWrapper,
u: int, obs: int, x_s: CyborgWrapperState) -> List[CyborgWrapperState]:
"""
Implements a particle filter
:param particles: the list of particles
:param max_num_particles: the maximum number of particles
:param train_env: the environment used for sampling
:param u: the latest control
:param obs: the latest observation
:param x_s: the true cyborg state
:return: the list of updated particles
"""
new_particles = []
failed_samples = 0
while len(new_particles) < max_num_particles:
# print(f"{len(new_particles)}/{max_num_particles}")
x = random.choice(particles)
train_env.set_state(state=x)
_, _, _, _, info = train_env.step(u)
@@ -46,74 +57,96 @@ def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
return new_particles
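
A usage sketch (not part of the commit), mirroring how the filter is wired into the evaluation loop below: after executing control u in the evaluation environment env, the belief particles are propagated one step with the simulation environment train_env.

# Illustrative only: one belief update after taking control u in env.
_, _, _, _, info = env.step(u)
particles = particle_filter(particles=particles, max_num_particles=50, train_env=train_env,
                            u=u, obs=info[agents_constants.COMMON.OBSERVATION],
                            x_s=info[agents_constants.COMMON.STATE])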


def restore_policy(x: CyborgWrapperState):
def restore_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, particles: List[CyborgWrapperState]) \
-> int:
"""
Implements a heuristic restore policy for Cage2
:param x: the certainty-equivalence state
:param train_env: the environment used for simulation
:param particles: the current list of particles
:return: the control
"""
u = -1
if x.s[1][2] == 2:
u = 0 # Ent0
if x.s[2][2] == 2:
u = 1 # Ent 1
if x.s[3][2] == 2:
u = 2 # Ent 2
if x.s[7][2] == 2:
u = 3 # Opserver

if x.s[1][2] == 1:
u = 8 # Ent0
if x.s[2][2] == 1:
u = 9 # Ent1
if x.s[3][2] == 1:
u = 10 # Ent2
if x.s[3][2] == 1:
u = 11 # Opserver
if x.s[9][2] == 1:
u = 22 # User1
if x.s[10][2] == 1:
u = 23 # User2
if x.s[11][2] == 1:
u = 24 # User3
if x.s[12][2] == 1:
u = 25 # User4
restore_actions = [0, 1, 2, 3]
remove_actions = [8, 9, 10, 11, 22, 23, 24, 25]
remove_hosts = [1, 2, 3, 7, 9, 10, 11, 12]
restore_hosts = [1, 2, 3, 7]
outcomes = {}
for h in remove_hosts:
outcomes[h] = []
for i, host in enumerate(remove_hosts):
for p in particles:
if p.s[host][2] == 1:
train_env.set_state(p)
train_env.step(action=remove_actions[i])
if train_env.s[host][2] == 0:
outcomes[host].append(1)
else:
outcomes[host].append(0)
for i, h in enumerate(remove_hosts):
if len(outcomes[h]) > 0:
remove_p = np.mean(outcomes[h])
if remove_p >= 0.9:
return remove_actions[i]
for i, host in enumerate(restore_hosts):
if x.s[host][2] > 0:
return restore_actions[i]
return u
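
A usage sketch (not part of the commit): the heuristic is queried with the most frequent particle as a certainty-equivalence state and returns -1 when no remove or restore action is warranted, in which case the evaluation loop below falls back to the rollout policy.

# Illustrative only: try the remove/restore heuristic before the rollout policy.
x_hat = monte_carlo_most_frequent_particle(particles=particles, N=100)
u = restore_policy(x=x_hat, train_env=train_env, particles=particles)
if u == -1:
    pass  # no host flagged for remove/restore; defer to rollout_policy below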


def rollout_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, id_to_state,
gamma=0.99, mc_samples=10):
def rollout_policy(train_env: CyborgScenarioTwoWrapper, J: List[float], state_to_id: Dict[str, int],
mu: List[List[float]], l: int, id_to_state: Dict[int, List[int]],
particles: List[CyborgWrapperState], gamma=0.99, mc_samples=10) -> Tuple[int, float]:
"""
A rollout policy for cage-2
:param train_env: the environment to use for sampling
:param J: the cost-to-go function of the base policy
:param state_to_id: the aggregate state to aggregate state id map
:param mu: the base policy
:param l: the lookahead horizon
:param id_to_state: the aggregate state id to aggregate state map
:param particles: the current list of particles
:param gamma: the discount factor
:param mc_samples: the number of Monte-Carlo samples to use
:return: the next control and its estimated value
"""
U = [27, 28, 29, 30, 31, 32, 35]
Q_n = []
u_r = restore_policy(x=x)
for u in U:
returns = []
for i in range(mc_samples):
train_env.set_state(x)
particle = random.choice(particles)
train_env.set_state(particle)
_, _, _, _, info = train_env.step(action=u)
x_prime = info[agents_constants.COMMON.STATE]
aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x_prime, state_to_id=state_to_id)
# c = -info[agents_constants.COMMON.REWARD]
c = Cage2AggregateMDP.cost_function(x=aggregate_state, u=U.index(u), id_to_state=id_to_state)
c = -info[agents_constants.COMMON.REWARD]
if l == 1:
returns.append(c + gamma * J[aggregate_state])
else:
returns.append(c + gamma * rollout_policy(copy.deepcopy(x_prime), train_env=train_env, J=J,
returns.append(c + gamma * rollout_policy(train_env=train_env, J=J,
state_to_id=state_to_id, id_to_state=id_to_state,
mu=mu, l=l - 1)[1])
Q_n.append(np.mean(returns))
u_star = int(np.argmin(Q_n))
J_star = Q_n[u_star]
J_star = float(Q_n[u_star])
u_star = U[u_star]
u_r = restore_policy(x=x, train_env=train_env, particles=particles)
if u_r != -1:
u_star = u_r
return u_star, J_star
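
A usage sketch (not part of the commit), matching the call in the evaluation loop below; mu, J, state_to_id and id_to_state are assumed to be loaded as in the __main__ block, and l=1 is used so only the base policy's cost-to-go J is consulted.

# Illustrative only: one-step-lookahead rollout over the aggregate MDP.
u, J_hat = rollout_policy(train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=1,
                          id_to_state=id_to_state, particles=particles, gamma=0.99, mc_samples=20)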


def base_policy(x, mu, id_to_state):
def base_policy(x: CyborgWrapperState, mu: List[List[float]], id_to_state: Dict[int, List[int]]) -> int:
"""
Implements the base policy mu
:param x: the current state id
:param mu: the base policy
:param id_to_state: the aggregate state id to aggregate state map
:return: the next control
"""
aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)
return Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, id_to_state=id_to_state)
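
A usage sketch (not part of the commit); note that base_policy reads the module-level state_to_id created in the __main__ block rather than taking it as a parameter.

# Illustrative only: greedy control of the base policy mu for the aggregate state of x.
u = base_policy(x=x, mu=mu, id_to_state=id_to_state)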
@@ -130,8 +163,8 @@ def base_policy(x, mu, id_to_state):
decoy_optimization=False)
N = 10000
max_env_steps = 100
mu = np.loadtxt("./mu.txt")
J = np.loadtxt("./J.txt")
mu = np.loadtxt("test/mu2.txt")
J = np.loadtxt("test/J2.txt")
X, state_to_id, id_to_state = Cage2AggregateMDP.X()
gamma = 0.99
l = 1
@@ -145,21 +178,24 @@ def base_policy(x, mu, id_to_state):
particles = env.initial_particles
while not done and t < max_env_steps:
monte_carlo_state = monte_carlo_most_frequent_particle(particles=particles, N=100)
# u = restore_policy(x=x)
u = restore_policy(x=monte_carlo_state)
u = restore_policy(x=monte_carlo_state, train_env=train_env, particles=particles)
if t <= 2:
u = 31
if u == -1:
# u = base_policy(x=monte_carlo_state, mu=mu, id_to_state=id_to_state)
# u = base_policy(x=x, mu=mu, id_to_state=id_to_state)
u = rollout_policy(x=x, state_to_id=state_to_id, id_to_state=id_to_state,
train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0]
u = rollout_policy(state_to_id=state_to_id, id_to_state=id_to_state, train_env=train_env, J=J, mu=mu,
gamma=gamma, l=l, particles=particles, mc_samples=20)[0]
_, _, _, _, info = env.step(u)
particles = particle_filter(particles=particles, max_num_particles=50,
train_env=train_env, u=u, obs=info[agents_constants.COMMON.OBSERVATION],
x_s=info[agents_constants.COMMON.STATE])
c = -info[agents_constants.COMMON.REWARD]
C += math.pow(gamma, t - 1) * c
print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
f"aggstate: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
C += c
# aggstate = id_to_state[Cage2AggregateMDP.get_aggregate_state(s=monte_carlo_state,
# state_to_id=state_to_id)]
# print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
# f"aggstate: {aggstate},"
# f"true state: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
x = info[agents_constants.COMMON.STATE]
t += 1
returns.append(C)
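
A sketch (not part of the commit) of one way the per-episode costs collected in returns could be summarized after the loop; np is numpy as imported at the top of the file.

# Illustrative only: average cumulative cost over the evaluated episodes.
print(f"average cost over {len(returns)} episodes: {float(np.mean(returns))}")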
3 changes: 0 additions & 3 deletions examples/eval/cyborg_scenario_two/test/.gitignore

This file was deleted.
