From 53870c95e0c64b9df9286bf6796cdb9a4361bc42 Mon Sep 17 00:00:00 2001 From: Limmen Date: Sat, 11 Jan 2025 08:29:08 +0100 Subject: [PATCH] cage-2 agg mdp --- .../cage2_aggregate_mdp.py | 480 ++++++++++++++++++ .../cyborg_scenario_two/eval_aggregate_mdp.py | 149 ++++++ .../cyborg_scenario_two/eval_on_base_env.py | 9 +- .../evaluate_on_wrapper_env.py | 1 - .../eval/cyborg_scenario_two/pomcp_eval.py | 146 +++--- .../src/csle_agents/constants/constants.py | 1 + .../gym_csle_cyborg/constants/constants.py | 1 + .../dao/cyborg_wrapper_state.py | 11 +- .../envs/cyborg_scenario_two_wrapper.py | 1 + 9 files changed, 726 insertions(+), 73 deletions(-) create mode 100644 examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py create mode 100644 examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py diff --git a/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py b/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py new file mode 100644 index 000000000..10ddd94a6 --- /dev/null +++ b/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py @@ -0,0 +1,480 @@ +import random +import numpy as np +from itertools import product +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.cyborg_wrapper_state import CyborgWrapperState + + +class Cage2AggregateMDP: + """ + Aggregate MDP for Cage-2 with the bline attacker + """ + + @staticmethod + def get_aggregate_control(mu, aggregate_state, id_to_state): + """ + Gets the aggregate control prescribed by policy mu to the aggregate state + """ + a = int(np.argmax(mu[aggregate_state])) + decoys_per_host = CyborgEnvUtil.get_decoy_actions_per_host(scenario=2) + state = id_to_state[aggregate_state] + decoy_state = state[4:] + if decoy_state[a] >= len(decoys_per_host[Cage2AggregateMDP.aggregate_control_target_to_original_target(a)]): + possible_targets = [2, 6] + if state[3] != -1: + possible_targets.append(state[3]) + if state[3] in [3, 4]: + possible_targets.append(1) + else: + possible_targets.append(0) + for ph in possible_targets: + if decoy_state[ph] < \ + len(decoys_per_host[Cage2AggregateMDP.aggregate_control_target_to_original_target(ph)]): + a = ph + return Cage2AggregateMDP.aggregate_control_to_original_control()[a] + + @staticmethod + def get_aggregate_state(s: CyborgWrapperState, state_to_id): + """ + Converts the cyborg state into the aggregate state + """ + decoy_state = Cage2AggregateMDP.get_aggregate_decoy_state(s.get_decoy_state()) + red_state = s.red_agent_state + target = -1 + target_1 = -1 + target_2 = -1 + if red_state == 1: + target_1 = Cage2AggregateMDP.red_target_to_aggregate_target(s.red_agent_target) + target = target_1 + if target_1 in [3, 4]: + target_2 = 1 + else: + target_2 = 0 + elif red_state >= 1: + target_1 = Cage2AggregateMDP.red_target_to_aggregate_target(s.red_action_targets[1]) + if target_1 in [3, 4]: + target_2 = 1 + else: + target_2 = 0 + if red_state == 2: + target = target_1 + if red_state in [3, 4, 5]: + if target_1 in [3, 4]: + target = 1 + else: + target = 0 + if red_state in [6, 7, 8, 9]: + target = 2 + if red_state in [10, 11, 12, 13, 14]: + target = 7 + decoy_state_str = ",".join(list(map(lambda x: str(x), decoy_state))) + state_str = f"{red_state},{target},{target_1},{target_2},{decoy_state_str}" + return state_to_id[state_str] + + @staticmethod + def feasible_next_states(state_id, state_to_id, id_to_state, u): + """ + Calculates the feasible set of next aggregate states when taking a control in a given aggregate state + """ + B_LINE_AGENT_JUMPS = [0, 1, 2, 2, 2, 2, 5, 5, 5, 
5, 9, 9, 9, 12, 13] + state = id_to_state[state_id] + red_state = state[0] + target = state[1] + target_1 = state[2] + target_2 = state[3] + decoy_state = state[4:] + feasible_next_red_states = [] + if red_state in [0, 1, 3, 4, 6, 7, 8, 10, 11, 13]: + feasible_next_red_states = [red_state + 1] + if red_state == 14: + feasible_next_red_states = [14] + if red_state == 2: + feasible_next_red_states = [red_state, red_state + 1] + if red_state in [5, 9, 12]: + feasible_next_red_states = [red_state + 1, B_LINE_AGENT_JUMPS[red_state]] + feasible_next_targets = [] + if target == -1 and red_state == 0: + feasible_next_targets = [3, 4, 5, 6] + if red_state in [1, 3, 4, 6, 7, 8, 10, 11, 13]: + feasible_next_targets = [target] + if target in [3, 4] and red_state == 2: + feasible_next_targets = [target] + [1] + elif target in [5, 6] and red_state == 2: + feasible_next_targets = [target] + [0] + elif target in [0, 1] and red_state == 5: + feasible_next_targets = [2, target_1] + elif target == 2 and red_state == 9: + feasible_next_targets = [7, target_2] + elif target == 7 and red_state == 12: + feasible_next_targets = [2, target] + next_decoy_state = decoy_state.copy() + if u == 0: + next_decoy_state[0] = min(next_decoy_state[0] + 1, 4) + elif u == 1: + next_decoy_state[1] = min(next_decoy_state[1] + 1, 1) + elif u == 2: + next_decoy_state[2] = min(next_decoy_state[2] + 1, 1) + elif u == 3: + next_decoy_state[3] = min(next_decoy_state[3] + 1, 4) + elif u == 4: + next_decoy_state[4] = min(next_decoy_state[4] + 1, 4) + elif u == 5: + next_decoy_state[5] = min(next_decoy_state[5] + 1, 2) + elif u == 6: + next_decoy_state[6] = min(next_decoy_state[6] + 1, 4) + feasible_next_target_1 = [] + feasible_next_target_2 = [] + if red_state == 0: + feasible_next_target_1.append(3) + feasible_next_target_1.append(4) + feasible_next_target_1.append(5) + feasible_next_target_1.append(6) + feasible_next_target_2.append(0) + feasible_next_target_2.append(1) + elif red_state == 1: + feasible_next_target_1 = [target] + if target in [3, 4]: + feasible_next_target_2 = [1] + else: + feasible_next_target_2 = [0] + else: + feasible_next_target_1 = [target_1] + feasible_next_target_2 = [target_2] + feasible_next_states = [] + + for feasible_target_1 in feasible_next_target_1: + for feasible_target_2 in feasible_next_target_2: + for feasible_red_state in feasible_next_red_states: + for feasible_target in feasible_next_targets: + if feasible_red_state == 1 and (feasible_target != feasible_target_1): + continue + if feasible_target_1 in [3, 4] and feasible_target_2 != 1: + continue + if feasible_target_1 in [5, 6] and feasible_target_2 != 0: + continue + if red_state == 5 and feasible_red_state == 2 and feasible_target != target_1: + continue + if red_state == 9 and feasible_red_state == 5 and feasible_target != target_2: + continue + if red_state == 12 and feasible_red_state == 9 and feasible_target != 2: + continue + if feasible_red_state == 6 and feasible_target == target_1: + continue + if feasible_red_state == 10 and feasible_target == target_2: + continue + if feasible_red_state == 13 and feasible_target == 2: + continue + if feasible_target != target and (feasible_red_state == red_state): + continue + if red_state in [2, 5, 9] and feasible_red_state != red_state and feasible_target == target: + continue + f_state = f"{feasible_red_state},{feasible_target},{feasible_target_1},{feasible_target_2}," \ + f"{','.join(list(map(lambda x: str(x), next_decoy_state)))}" + try: + feasible_next_states.append(state_to_id[f_state]) + 
except Exception as e: + print(type(e)) + print(f"curr state: {state}, next state: {f_state}") + print(feasible_next_target_1) + print(feasible_next_target_2) + import sys + sys.exit() + return feasible_next_states + + @staticmethod + def exploit_success_probability(target, decoy_state): + """ + Calculates the probability that an exploit against the given target is successful, given the decoy state + """ + if target == 0: + return [1.0, 0.25, 0.1238899, 0.0838379, 0.083196][decoy_state] + elif target == 1: + return [1.0, 0.250012][decoy_state] + elif target == 2: + return [1.0, 0.25029][decoy_state] + elif target == 3: + return [1.0, 0.87456, 0.832632, 0.813176, 0.799056][decoy_state] + elif target == 4: + return [1.0, 0.250698, 0.166932, 0.124348, 0.09942][decoy_state] + elif target == 5: + return [1.0, 1.0, 0.93785][decoy_state] + elif target == 6: + return 1 + elif target == 7: + return [1.0, 0.25038, 0.12526, 0.082999, 0.083086][decoy_state] + + @staticmethod + def transition_probability(state_to_id, id_to_state, x, x_prime, u): + """ + Calculates P(x_prime | x, u), where ,x_prime,x,u are aggregate states and controls + """ + feasible_next_states = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + if x_prime not in feasible_next_states: + return 0 + state = id_to_state[x] + state_prime = id_to_state[x_prime] + red_state = state[0] + + if red_state == 0: + return 0.25 + + if red_state == 14: + return 1 + if red_state in [1, 3, 4, 6, 7, 8, 10, 11, 13]: + return 1 + target = state[1] + next_decoy_state = state_prime[4:] + next_target = state_prime[1] + next_red_state = state_prime[0] + next_target_1 = state_prime[2] + next_target_2 = state_prime[3] + if next_target in [3, 4, 5, 6] and next_target_1 != next_target: + return 0 + if next_target in [0, 1] and next_target_2 != next_target: + return 0 + if target == 6: + ds = -1 + elif target == 7: + ds = next_decoy_state[6] + else: + ds = next_decoy_state[target] + exploit_success_prob = Cage2AggregateMDP.exploit_success_probability(target=target, decoy_state=ds) + if red_state == 12 and next_red_state == 13: + return exploit_success_prob + if red_state == 12 and next_red_state == 12: + return 1 - exploit_success_prob + if (next_red_state == red_state + 1) and next_target != target: + return exploit_success_prob + else: + return 1 - exploit_success_prob + + @staticmethod + def aggregate_control_to_original_control(): + """ + Returns a dict that maps an aggregate control to the original control space + """ + return {0: 27, 1: 28, 2: 29, 3: 30, 4: 31, 5: 32, 6: 35} + + @staticmethod + def get_aggregate_decoy_state(decoy_state): + """ + Converts a decoy state into an aggregate decoy state + """ + ds = [] + decoy_hosts = [1, 2, 3, 9, 10, 11, 12] + for dh in decoy_hosts: + ds.append(decoy_state[dh]) + return ds + + @staticmethod + def get_aggregate_decoy_state_space(): + """ + Gets the aggregate decoy state space + """ + values = [range(5), range(2), range(2), range(5), range(5), range(3), range(5)] + return list(product(*values)) + + @staticmethod + def red_target_to_aggregate_target(target): + """ + Converts a red target to the corresponding aggregate target + """ + red_target_to_target_idx = {0: -1, 1: 0, 2: 1, 3: 2, 4: 0, 5: 0, 6: 0, 7: 7, 8: -1, 9: 3, 10: 4, 11: 5, 12: 6} + return red_target_to_target_idx[target] + + @staticmethod + def aggregate_control_target_to_original_target(target): + """ + Converts a red target to the corresponding aggregate target + """ + 
aggregate_target_to_red_target = {0: 1, 1: 2, 2: 3, 3: 9, 4: 10, 5: 11, 6: 7} + return aggregate_target_to_red_target[target] + + @staticmethod + def X(): + """ + Aggregate state space + """ + decoy_states = Cage2AggregateMDP.get_aggregate_decoy_state_space() + state_to_id = {} + id_to_state = {} + X = [] + state_id = 0 + targets = [-1] + list(range(8)) + target_1s = [-1, 3, 4, 5, 6] + target_2s = [-1, 0, 1] + for red_state in range(15): + for target_1 in target_1s: + for target_2 in target_2s: + if target_1 in [3, 4] and target_2 != 1: + continue + if target_1 in [5, 6] and target_2 != 0: + continue + if target_2 != -1 and target_1 == -1: + continue + for target in targets: + if target == -1 and (target_1 != -1 or target_2 != -1): + continue + if target != -1 and (target_1 == -1 or target_2 == -1): + continue + if red_state in [1, 2] and target not in [3, 4, 5, 6]: + continue + if red_state in [3, 4, 5] and target not in [0, 1]: + continue + if red_state in [6, 7, 8, 9] and target != 2: + continue + if red_state in [10, 11, 12, 13] and target != 7: + continue + if target in [3, 4, 5, 6] and target_1 != target: + continue + if target in [0, 1] and target_2 != target: + continue + if target in [3, 4, 5, 6] and red_state not in [0, 1, 2]: + continue + if target in [0, 1] and red_state not in [3, 4, 5]: + continue + if target == 2 and red_state not in [6, 7, 8, 9]: + continue + if target == 7 and red_state not in [10, 11, 12, 13, 14]: + continue + if red_state == 0 and (target != -1 or target_1 != -1 or target_2 != -1): + continue + if red_state >= 1 and (target_1 == -1 or target == -1 or target_2 == -1): + continue + for decoy_state in decoy_states: + state_to_id[f"{red_state},{target},{target_1},{target_2}," \ + f"{','.join(list(map(lambda x: str(x), list(decoy_state))))}"] = state_id + id_to_state[state_id] = [red_state, target, target_1, target_2] + list(decoy_state) + X.append(state_id) + state_id += 1 + return X, state_to_id, id_to_state + + @staticmethod + def U(): + """ + Aggregate control space + """ + return [0, 1, 2, 3, 4, 5, 6] + + @staticmethod + def cost_function(x, id_to_state): + """ + Aggregate cost function + """ + red_state = id_to_state[x][0] + if red_state in [3, 4, 5]: + return 0.1 + elif red_state in [6, 7, 8, 9]: + return 1 + elif red_state in [10, 11, 12]: + return 2 + elif red_state in [13, 14]: + return 3 + return 0 + + @staticmethod + def vi(X, U, gamma, epsilon, verbose, state_to_id, id_to_state): + """ + Value iteration + """ + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, + decoy_optimization=False) + J = np.zeros(len(X)) + iteration = 0 + while True: + delta = 0 + for x in X: + if x % 100000 == 0: + print(f"{x}/{len(X)}") + u_star, J_u_star = Cage2AggregateMDP.TJx(x=x, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state) + delta = max(delta, np.abs(J_u_star - J[x])) + J[x] = J_u_star + iteration += 1 + if verbose: + print(f"VI iteration: {iteration}, delta: {delta}, epsilon: {epsilon}") + ssx = [state_to_id[f"0,-1,-1,-1,0,0,0,0,0,0,0"], state_to_id[f"1,3,3,1,0,0,0,0,0,0,0"], + state_to_id[f"2,3,3,1,0,0,0,0,0,0,0"], + state_to_id[f"3,1,3,1,0,0,0,0,0,0,0"], state_to_id[f"4,1,3,1,0,0,0,0,0,0,0"], + state_to_id[f"5,1,3,1,0,0,0,0,0,0,0"], state_to_id[f"6,2,3,1,0,0,0,0,0,0,0"], + state_to_id[f"7,2,3,1,0,0,0,0,0,0,0"], state_to_id[f"8,2,3,1,0,0,0,0,0,0,0"], + state_to_id[f"9,2,3,1,0,0,0,0,0,0,0"], state_to_id[f"10,7,3,1,0,0,0,0,0,0,0"], + 
state_to_id[f"11,7,3,1,0,0,0,0,0,0,0"], state_to_id[f"12,7,3,1,0,0,0,0,0,0,0"], + state_to_id[f"13,7,3,1,0,0,0,0,0,0,0"], state_to_id[f"14,7,3,1,0,0,0,0,0,0,0"]] + for sx in ssx: + u = Cage2AggregateMDP.TJx(x=sx, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state)[0] + action = Cage2AggregateMDP.aggregate_control_to_original_control()[u] + sts = id_to_state[sx] + print(f"mu({sts})={action_id_to_type_and_host[action][1]}") + if delta < epsilon: + break + mu = Cage2AggregateMDP.policy(X=X, U=U, gamma=gamma, J=J, state_to_id=state_to_id, id_to_state=id_to_state) + return mu, J + + @staticmethod + def TJx(x, J, U, gamma, state_to_id, id_to_state): + """ + Implements the Bellman operator (TJ))(x) + """ + Q_x = np.zeros(len(U)) + for u in U: + feasible_x_prime = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + for x_prime in feasible_x_prime: + p = Cage2AggregateMDP.transition_probability(state_to_id=state_to_id, id_to_state=id_to_state, x=x, + x_prime=x_prime, u=u) + c = Cage2AggregateMDP.cost_function(x=x, id_to_state=id_to_state) + Q_x[u] += p * (c + gamma * J[x_prime]) + u_star = int(np.argmin(Q_x)) + return u_star, Q_x[u_star] + + @staticmethod + def policy(X, U, gamma, J, state_to_id, id_to_state): + """ + Constructs a policy based on J + """ + mu = np.zeros((len(X), len(U))) + for x in X: + mu[x][Cage2AggregateMDP.TJx(x=x, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state)[0]] = 1.0 + return mu + + @staticmethod + def run_vi(): + """ + Runs value iteration and saves the results to disk + """ + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + U = Cage2AggregateMDP.U() + gamma = 0.99 + epsilon = 0.1 + mu, J = Cage2AggregateMDP.vi(X=X, U=U, gamma=gamma, epsilon=epsilon, verbose=True, state_to_id=state_to_id, + id_to_state=id_to_state) + np.savetxt("mu1.txt", mu) + np.savetxt("J1.txt", J) + + @staticmethod + def test(): + """ + Simulates N test trajectories of the aggregate MDP + """ + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + U = Cage2AggregateMDP.U() + N = 1000 + for k in range(N): + x = state_to_id[f"0,-1,-1,0,0,0,0,0,0,0"] + while id_to_state[x][0] != 14: + u = random.choice(U) + feasible_states = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + probs = list(map(lambda x_prime: Cage2AggregateMDP.transition_probability( + state_to_id=state_to_id, id_to_state=id_to_state, x=x, x_prime=x_prime, u=u), feasible_states)) + print(f"u: {u}, x: {id_to_state[x]}") + x = np.random.choice(feasible_states, p=probs) + + +if __name__ == '__main__': + pass diff --git a/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py b/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py new file mode 100644 index 000000000..3697b3fd2 --- /dev/null +++ b/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py @@ -0,0 +1,149 @@ +import numpy as np +import copy +from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper +from gym_csle_cyborg.dao.red_agent_type import RedAgentType +from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.cyborg_wrapper_state import CyborgWrapperState +from cyborg_agg_mdp import Cage2AggregateMDP + + +# def monte_carlo_most_frequent(elements, num_samples): +# if not elements: +# raise ValueError("The input list is empty.") +# +# # Perform random sampling +# 
samples = [random.choice(elements) for _ in range(num_samples)] +# +# # Count occurrences of sampled elements +# counter = Counter(samples) +# +# # Find the most common element +# most_frequent_element = counter.most_common(1)[0][0] +# return most_frequent_element + +# def particle_filter(particles, max_num_particles, train_env, action, obs): +# new_particles = [] +# while len(particles) < max_num_particles: +# x = random.choice(particles) +# train_env.set_state(state=x) +# _, r, _, _, info = train_env.step(action) +# s_prime = info["s"] +# o = info["o"] +# if o == obs: +# new_particles.append(s_prime) +# return new_particles + +def restore_policy(s: CyborgWrapperState): + a = -1 + if s.s[1][2] == 2: + a = 0 # Ent0 + if s.s[2][2] == 2: + a = 1 # Ent 1 + if s.s[3][2] == 2: + a = 2 # Ent 2 + if s.s[7][2] == 2: + a = 3 # Opserver + + if s.s[1][2] == 1: + a = 8 # Ent0 + if s.s[2][2] == 1: + a = 9 # Ent1 + if s.s[3][2] == 1: + a = 10 # Ent2 + if s.s[3][2] == 1: + a = 11 # Opserver + if s.s[9][2] == 1: + a = 22 # User1 + if s.s[10][2] == 1: + a = 23 # User2 + if s.s[11][2] == 1: + a = 24 # User3 + if s.s[12][2] == 1: + a = 25 # User4 + return a + +def rollout(s: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, gamma=0.99): + # U = [0, 1, 2, 3, 8, 9, 10, 11, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 35] + U = [27, 28, 29, 30, 31, 32, 35] + U = [27, 28, 29, 30, 31, 32] + Q_n = [] + for u in U: + u_r = restore_policy(s=s) + if u_r != -1: + o, c, done, _, info = train_env.step(action=u_r) + s_prime = info["s"] + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id) + if l == 1: + return u_r, J[aggregate_state] + else: + returns = [] + for i in range(2): + returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=l-1)[1]) + cost_to_go = np.mean(returns) + else: + train_env.set_state(s) + o, c, done, _, info = train_env.step(action=u) + s_prime = info["s"] + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id) + if l == 1: + cost_to_go = J[aggregate_state] + else: + returns = [] + for i in range(2): + returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=l-1)[1]) + cost_to_go = np.mean(returns) + Q_n.append(-c + gamma*cost_to_go) + # print(Q_n) + # print(U[int(np.argmin(Q_n))]) + u_star = int(np.argmin(Q_n)) + return U[u_star], Q_n[u_star] + + +if __name__ == '__main__': + config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="", + save_trace=False, reward_shaping=False, scenario=2, + red_agent_type=RedAgentType.B_LINE_AGENT) + env = CyborgScenarioTwoWrapper(config=config) + train_env = CyborgScenarioTwoWrapper(config=config) + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, decoy_optimization=False) + N = 10000 + max_env_steps = 100 + mu = np.loadtxt("./mu1.txt") + J = np.loadtxt("./J1.txt") + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + gamma = 0.99 + l = 3 + returns = [] + for i in range(N): + print(f"{i}/{N}") + done = False + _, info = env.reset() + s = info["s"] + t = 1 + R = 0 + particles = env.initial_particles + while not done and t < max_env_steps: + # monte_carlo_state = monte_carlo_most_frequent(elements=particles, num_samples=100) + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s, state_to_id=state_to_id) + a = -1 + a = restore_policy(s=s) + + if t <= 1: 
+ a = 31 + if a == -1: + a = Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, + id_to_state=id_to_state) + # print(f"base: {a}") + a = rollout(s=s, state_to_id=state_to_id, train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0] + # print(f"rollout: {a}") + o, r, done, _, info = env.step(a) + # particles = particle_filter(particles=particles, max_num_particles=1000, + # train_env=train_env, action=a, obs=o) + s = info["s"] + t+= 1 + R+= r + # print(f"t:{t}, r: {r}, a: {action_id_to_type_and_host[a]}, R: {R}, aggstate: {id_to_state[aggregate_state]}") + returns.append(R) + print(np.mean(returns)) \ No newline at end of file diff --git a/examples/eval/cyborg_scenario_two/eval_on_base_env.py b/examples/eval/cyborg_scenario_two/eval_on_base_env.py index 754683087..0cd7a1d53 100644 --- a/examples/eval/cyborg_scenario_two/eval_on_base_env.py +++ b/examples/eval/cyborg_scenario_two/eval_on_base_env.py @@ -20,17 +20,12 @@ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - # print(csle_cyborg_env.action_id_to_type_and_host) - # import sys - # sys.exit(0) - # print("Starting policy evaluation") for i in range(num_evaluations): o, _ = csle_cyborg_env.reset() R = 0 t = 0 while t < max_horizon: - # a = ppo_policy.action(o=o) - a = 4 + a = ppo_policy.action(o=o) o, r, done, _, info = csle_cyborg_env.step(a) table = csle_cyborg_env.get_true_table() print(table) @@ -38,4 +33,4 @@ R += r t += 1 returns.append(R) - # print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") + print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") diff --git a/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py b/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py index da39cea81..3bfc21d4b 100644 --- a/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py +++ b/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py @@ -19,7 +19,6 @@ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - print("Starting policy evaluation") for i in range(num_evaluations): o, _ = env.reset() R = 0 diff --git a/examples/eval/cyborg_scenario_two/pomcp_eval.py b/examples/eval/cyborg_scenario_two/pomcp_eval.py index e0e60c711..109ed3c9d 100644 --- a/examples/eval/cyborg_scenario_two/pomcp_eval.py +++ b/examples/eval/cyborg_scenario_two/pomcp_eval.py @@ -1,94 +1,112 @@ +from typing import List import numpy as np -import torch -import random -import json -import io -from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig -from gym_csle_cyborg.dao.red_agent_type import RedAgentType -from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender +import time from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig from csle_agents.agents.pomcp.pomcp import POMCP from csle_agents.agents.pomcp.pomcp_acquisition_function_type import POMCPAcquisitionFunctionType import csle_agents.constants.constants as agents_constants from csle_common.logging.log import Logger +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.red_agent_type import RedAgentType + + +def heuristic_value(o: List[List[int]]) -> float: + """ + A heuristic value function + + :param o: the observation vector + :return: the value + """ + host_costs = CyborgEnvUtil.get_host_compromised_costs() + val = 0 + for i in range(len(o)): + if o[i][2] > 0: + val += host_costs[i] + return val + if __name__ == 
'__main__': - # ppo_policy = PPOPolicy(model=None, simulation_name="", save_path="") - config = CSLECyborgConfig( - gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT], - maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, decoy_state=True, - scanned_state=True, decoy_optimization=False, cache_visited_states=False) - eval_env = CyborgScenarioTwoDefender(config=config) - config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="", - save_trace=False, reward_shaping=False, scenario=2) + config = CSLECyborgWrapperConfig( + gym_env_name="csle-cyborg-scenario-two-wrapper-v1", maximum_steps=100, save_trace=False, scenario=2, + reward_shaping=True, red_agent_type=RedAgentType.B_LINE_AGENT) + eval_env = CyborgScenarioTwoWrapper(config=config) train_env = CyborgScenarioTwoWrapper(config=config) + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, + decoy_optimization=False) - num_evaluations = 10 - max_horizon = 100 - returns = [] - seed = 215125 - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) + N = 5000 + rollout_policy = lambda x, deterministic: 35 + value_function = heuristic_value A = train_env.get_action_space() - gamma = 0.75 - c = 1 - print("Starting policy evaluation") - for i in range(num_evaluations): + gamma = 0.99 + reinvigoration = False + reinvigorated_particles_ratio = 0.0 + initial_particles = train_env.initial_particles + planning_time = 3.75 + prune_action_space = False + max_particles = 1000 + max_planning_depth = 50 + max_rollout_depth = 4 + c = 0.5 + c2 = 15000 + use_rollout_policy = False + prior_weight = 5 + prior_confidence = 0 + acquisition_function_type = POMCPAcquisitionFunctionType.UCB + log_steps_frequency = 1 + max_negative_samples = 20 + default_node_value = 0 + verbose = False + eval_batch_size = 100 + max_env_steps = 100 + prune_size = 3 + start = time.time() + + # Run N episodes + returns = [] + for i in range(N): + done = False + action_sequence = [] _, info = eval_env.reset() s = info[agents_constants.COMMON.STATE] train_env.reset() - initial_particles = train_env.initial_particles - max_particles = 1000 - planning_time = 60 - value_function = lambda x: 0 - reinvigoration = False - rollout_policy = False - verbose = False - default_node_value = 0 - prior_weight = 1 - acquisition_function_type = POMCPAcquisitionFunctionType.UCB - use_rollout_policy = False - reinvigorated_particles_ratio = False - prune_action_space = False - prune_size = 3 - prior_confidence = 0 pomcp = POMCP(A=A, gamma=gamma, env=train_env, c=c, initial_particles=initial_particles, planning_time=planning_time, max_particles=max_particles, rollout_policy=rollout_policy, value_function=value_function, reinvigoration=reinvigoration, verbose=verbose, default_node_value=default_node_value, prior_weight=prior_weight, - acquisition_function_type=acquisition_function_type, c2=1500, + acquisition_function_type=acquisition_function_type, c2=c2, use_rollout_policy=use_rollout_policy, prior_confidence=prior_confidence, reinvigorated_particles_ratio=reinvigorated_particles_ratio, prune_action_space=prune_action_space, prune_size=prune_size) - rollout_depth = 4 - planning_depth = 100 R = 0 - t = 0 - action_sequence = [] - while t < max_horizon: - pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth) + t = 1 + + # Run episode + while not done and t <= max_env_steps: + rollout_depth = 
max_rollout_depth + planning_depth = max_planning_depth + pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth, t=t) action = pomcp.get_action() - o, r, done, _, info = eval_env.step(action) + o, _, done, _, info = eval_env.step(action) + r = info[agents_constants.COMMON.REWARD] action_sequence.append(action) s_prime = info[agents_constants.COMMON.STATE] obs_id = info[agents_constants.COMMON.OBSERVATION] - pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id) - print(eval_env.get_true_table()) - print(eval_env.get_table()) + pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id, t=t) R += r t += 1 - Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action}, r: {r}, o: {obs_id}, " - f"s_prime: {s_prime}," - f", action sequence: {action_sequence}, R: {R}") + if t % log_steps_frequency == 0: + Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action_id_to_type_and_host[action]}, r: {r}, " + f"action sequence: {action_sequence}, R: {round(R, 2)}") + + # Logging returns.append(R) - print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") - results = {} - results["seed"] = seed - results["training_time"] = 0 - results["returns"] = returns - results["planning_time"] = planning_time - json_str = json.dumps(results, indent=4, sort_keys=True) - with io.open(f"/Users/kim/pomcp_{0}_60s.json", 'w', encoding='utf-8') as f: - f.write(json_str) + progress = round((i + 1) / N, 2) + time_elapsed_minutes = round((time.time() - start) / 60, 3) + Logger.__call__().get_logger().info( + f"[POMCP] episode: {i}, J:{R}, " + f"J_avg: {np.mean(returns)}, " + f"progress: {round(progress * 100, 2)}%, " + f"runtime: {time_elapsed_minutes} min") diff --git a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py index 45589c5b3..d604f6917 100644 --- a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py +++ b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py @@ -57,6 +57,7 @@ class COMMON: EVALUATE_WITH_DISCOUNT = "evaluate_with_discount" STATE = "s" OBSERVATION = "o" + REWARD = "r" class PPO: diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py index 5f6c76b35..037354697 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py @@ -31,6 +31,7 @@ class ENV_METRICS: ATTACKER_ACTION = "a2" OBSERVATION = "o" OBSERVATION_VECTOR = "obs_vec" + REWARD = "r" OBSERVATION_ID = "obs_id" TIME_STEP = "t" AVERAGE_UPPER_BOUND_RETURN = "average_upper_bound_return" diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py index 3f412f2f4..d73a65f79 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py @@ -1,5 +1,6 @@ from typing import List, Dict, Any, Union from csle_base.json_serializable import JSONSerializable +import gym_csle_cyborg.constants.constants as env_constants class CyborgWrapperState(JSONSerializable): @@ -75,7 +76,7 @@ def from_dict(d: Dict[str, Any]) -> 
"CyborgWrapperState": obj = CyborgWrapperState( s=d["s"], scan_state=d["scan_state"], op_server_restored=d["op_server_restored"], obs=d["obs"], red_action_targets=d["red_action_targets"], - privilege_escalation_detected=d["privilege_escalation_deteceted"], red_agent_state=d["red_agent_state"], + privilege_escalation_detected=d["privilege_escalation_detected"], red_agent_state=d["red_agent_state"], red_agent_target=d["red_agent_target"], attacker_observed_decoy=d["attacker_observed_decoy"], detected=d["detected"], malware_state=d["malware_state"], ssh_access=d["ssh_access"], escalated=d["escalated"], exploited=d["exploited"], bline_base_jump=d["bline_base_jump"], @@ -133,3 +134,11 @@ def from_json_file(json_file_path: str) -> "CyborgWrapperState": json_str = f.read() dto = CyborgWrapperState.from_json_str(json_str=json_str) return dto + + def get_decoy_state(self): + """ + Extracts the decoy state + + :return: a list with the decoy state of each host + """ + return [host_state[env_constants.CYBORG.HOST_STATE_DECOY_IDX] for host_state in self.s] \ No newline at end of file diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py index da7f256ff..674598ff8 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py @@ -410,6 +410,7 @@ def step(self, action: int) -> Tuple[npt.NDArray[Any], float, bool, bool, Dict[s info[env_constants.ENV_METRICS.OBSERVATION] = CyborgEnvUtil.state_vector_to_state_id( state_vector=obs, observation=True) info[env_constants.ENV_METRICS.OBSERVATION_VECTOR] = obs + info[env_constants.ENV_METRICS.REWARD] = r self.scan_state = copy.deepcopy(scan_state) self.s = s_prime self.last_obs = copy.deepcopy(obs)