From 53870c95e0c64b9df9286bf6796cdb9a4361bc42 Mon Sep 17 00:00:00 2001 From: Limmen Date: Sat, 11 Jan 2025 08:29:08 +0100 Subject: [PATCH] cage-2 agg mdp --- .../cage2_aggregate_mdp.py | 480 ++++++++++++++++++ .../cyborg_scenario_two/eval_aggregate_mdp.py | 149 ++++++ .../cyborg_scenario_two/eval_on_base_env.py | 9 +- .../evaluate_on_wrapper_env.py | 1 - .../eval/cyborg_scenario_two/pomcp_eval.py | 146 +++--- .../src/csle_agents/constants/constants.py | 1 + .../gym_csle_cyborg/constants/constants.py | 1 + .../dao/cyborg_wrapper_state.py | 11 +- .../envs/cyborg_scenario_two_wrapper.py | 1 + 9 files changed, 726 insertions(+), 73 deletions(-) create mode 100644 examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py create mode 100644 examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py diff --git a/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py b/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py new file mode 100644 index 000000000..10ddd94a6 --- /dev/null +++ b/examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py @@ -0,0 +1,480 @@ +import random +import numpy as np +from itertools import product +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.cyborg_wrapper_state import CyborgWrapperState + + +class Cage2AggregateMDP: + """ + Aggregate MDP for Cage-2 with the bline attacker + """ + + @staticmethod + def get_aggregate_control(mu, aggregate_state, id_to_state): + """ + Gets the aggregate control prescribed by policy mu to the aggregate state + """ + a = int(np.argmax(mu[aggregate_state])) + decoys_per_host = CyborgEnvUtil.get_decoy_actions_per_host(scenario=2) + state = id_to_state[aggregate_state] + decoy_state = state[4:] + if decoy_state[a] >= len(decoys_per_host[Cage2AggregateMDP.aggregate_control_target_to_original_target(a)]): + possible_targets = [2, 6] + if state[3] != -1: + possible_targets.append(state[3]) + if state[3] in [3, 4]: + possible_targets.append(1) + else: + possible_targets.append(0) + for ph in possible_targets: + if decoy_state[ph] < \ + len(decoys_per_host[Cage2AggregateMDP.aggregate_control_target_to_original_target(ph)]): + a = ph + return Cage2AggregateMDP.aggregate_control_to_original_control()[a] + + @staticmethod + def get_aggregate_state(s: CyborgWrapperState, state_to_id): + """ + Converts the cyborg state into the aggregate state + """ + decoy_state = Cage2AggregateMDP.get_aggregate_decoy_state(s.get_decoy_state()) + red_state = s.red_agent_state + target = -1 + target_1 = -1 + target_2 = -1 + if red_state == 1: + target_1 = Cage2AggregateMDP.red_target_to_aggregate_target(s.red_agent_target) + target = target_1 + if target_1 in [3, 4]: + target_2 = 1 + else: + target_2 = 0 + elif red_state >= 1: + target_1 = Cage2AggregateMDP.red_target_to_aggregate_target(s.red_action_targets[1]) + if target_1 in [3, 4]: + target_2 = 1 + else: + target_2 = 0 + if red_state == 2: + target = target_1 + if red_state in [3, 4, 5]: + if target_1 in [3, 4]: + target = 1 + else: + target = 0 + if red_state in [6, 7, 8, 9]: + target = 2 + if red_state in [10, 11, 12, 13, 14]: + target = 7 + decoy_state_str = ",".join(list(map(lambda x: str(x), decoy_state))) + state_str = f"{red_state},{target},{target_1},{target_2},{decoy_state_str}" + return state_to_id[state_str] + + @staticmethod + def feasible_next_states(state_id, state_to_id, id_to_state, u): + """ + Calculates the feasible set of next aggregate states when taking a control in a given aggregate state + """ + B_LINE_AGENT_JUMPS = [0, 1, 2, 2, 2, 2, 5, 5, 5, 
5, 9, 9, 9, 12, 13] + state = id_to_state[state_id] + red_state = state[0] + target = state[1] + target_1 = state[2] + target_2 = state[3] + decoy_state = state[4:] + feasible_next_red_states = [] + if red_state in [0, 1, 3, 4, 6, 7, 8, 10, 11, 13]: + feasible_next_red_states = [red_state + 1] + if red_state == 14: + feasible_next_red_states = [14] + if red_state == 2: + feasible_next_red_states = [red_state, red_state + 1] + if red_state in [5, 9, 12]: + feasible_next_red_states = [red_state + 1, B_LINE_AGENT_JUMPS[red_state]] + feasible_next_targets = [] + if target == -1 and red_state == 0: + feasible_next_targets = [3, 4, 5, 6] + if red_state in [1, 3, 4, 6, 7, 8, 10, 11, 13]: + feasible_next_targets = [target] + if target in [3, 4] and red_state == 2: + feasible_next_targets = [target] + [1] + elif target in [5, 6] and red_state == 2: + feasible_next_targets = [target] + [0] + elif target in [0, 1] and red_state == 5: + feasible_next_targets = [2, target_1] + elif target == 2 and red_state == 9: + feasible_next_targets = [7, target_2] + elif target == 7 and red_state == 12: + feasible_next_targets = [2, target] + next_decoy_state = decoy_state.copy() + if u == 0: + next_decoy_state[0] = min(next_decoy_state[0] + 1, 4) + elif u == 1: + next_decoy_state[1] = min(next_decoy_state[1] + 1, 1) + elif u == 2: + next_decoy_state[2] = min(next_decoy_state[2] + 1, 1) + elif u == 3: + next_decoy_state[3] = min(next_decoy_state[3] + 1, 4) + elif u == 4: + next_decoy_state[4] = min(next_decoy_state[4] + 1, 4) + elif u == 5: + next_decoy_state[5] = min(next_decoy_state[5] + 1, 2) + elif u == 6: + next_decoy_state[6] = min(next_decoy_state[6] + 1, 4) + feasible_next_target_1 = [] + feasible_next_target_2 = [] + if red_state == 0: + feasible_next_target_1.append(3) + feasible_next_target_1.append(4) + feasible_next_target_1.append(5) + feasible_next_target_1.append(6) + feasible_next_target_2.append(0) + feasible_next_target_2.append(1) + elif red_state == 1: + feasible_next_target_1 = [target] + if target in [3, 4]: + feasible_next_target_2 = [1] + else: + feasible_next_target_2 = [0] + else: + feasible_next_target_1 = [target_1] + feasible_next_target_2 = [target_2] + feasible_next_states = [] + + for feasible_target_1 in feasible_next_target_1: + for feasible_target_2 in feasible_next_target_2: + for feasible_red_state in feasible_next_red_states: + for feasible_target in feasible_next_targets: + if feasible_red_state == 1 and (feasible_target != feasible_target_1): + continue + if feasible_target_1 in [3, 4] and feasible_target_2 != 1: + continue + if feasible_target_1 in [5, 6] and feasible_target_2 != 0: + continue + if red_state == 5 and feasible_red_state == 2 and feasible_target != target_1: + continue + if red_state == 9 and feasible_red_state == 5 and feasible_target != target_2: + continue + if red_state == 12 and feasible_red_state == 9 and feasible_target != 2: + continue + if feasible_red_state == 6 and feasible_target == target_1: + continue + if feasible_red_state == 10 and feasible_target == target_2: + continue + if feasible_red_state == 13 and feasible_target == 2: + continue + if feasible_target != target and (feasible_red_state == red_state): + continue + if red_state in [2, 5, 9] and feasible_red_state != red_state and feasible_target == target: + continue + f_state = f"{feasible_red_state},{feasible_target},{feasible_target_1},{feasible_target_2}," \ + f"{','.join(list(map(lambda x: str(x), next_decoy_state)))}" + try: + feasible_next_states.append(state_to_id[f_state]) + 
except Exception as e: + print(type(e)) + print(f"curr state: {state}, next state: {f_state}") + print(feasible_next_target_1) + print(feasible_next_target_2) + import sys + sys.exit() + return feasible_next_states + + @staticmethod + def exploit_success_probability(target, decoy_state): + """ + Calculates the probability that an exploit against the given target is successful, given the decoy state + """ + if target == 0: + return [1.0, 0.25, 0.1238899, 0.0838379, 0.083196][decoy_state] + elif target == 1: + return [1.0, 0.250012][decoy_state] + elif target == 2: + return [1.0, 0.25029][decoy_state] + elif target == 3: + return [1.0, 0.87456, 0.832632, 0.813176, 0.799056][decoy_state] + elif target == 4: + return [1.0, 0.250698, 0.166932, 0.124348, 0.09942][decoy_state] + elif target == 5: + return [1.0, 1.0, 0.93785][decoy_state] + elif target == 6: + return 1 + elif target == 7: + return [1.0, 0.25038, 0.12526, 0.082999, 0.083086][decoy_state] + + @staticmethod + def transition_probability(state_to_id, id_to_state, x, x_prime, u): + """ + Calculates P(x_prime | x, u), where ,x_prime,x,u are aggregate states and controls + """ + feasible_next_states = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + if x_prime not in feasible_next_states: + return 0 + state = id_to_state[x] + state_prime = id_to_state[x_prime] + red_state = state[0] + + if red_state == 0: + return 0.25 + + if red_state == 14: + return 1 + if red_state in [1, 3, 4, 6, 7, 8, 10, 11, 13]: + return 1 + target = state[1] + next_decoy_state = state_prime[4:] + next_target = state_prime[1] + next_red_state = state_prime[0] + next_target_1 = state_prime[2] + next_target_2 = state_prime[3] + if next_target in [3, 4, 5, 6] and next_target_1 != next_target: + return 0 + if next_target in [0, 1] and next_target_2 != next_target: + return 0 + if target == 6: + ds = -1 + elif target == 7: + ds = next_decoy_state[6] + else: + ds = next_decoy_state[target] + exploit_success_prob = Cage2AggregateMDP.exploit_success_probability(target=target, decoy_state=ds) + if red_state == 12 and next_red_state == 13: + return exploit_success_prob + if red_state == 12 and next_red_state == 12: + return 1 - exploit_success_prob + if (next_red_state == red_state + 1) and next_target != target: + return exploit_success_prob + else: + return 1 - exploit_success_prob + + @staticmethod + def aggregate_control_to_original_control(): + """ + Returns a dict that maps an aggregate control to the original control space + """ + return {0: 27, 1: 28, 2: 29, 3: 30, 4: 31, 5: 32, 6: 35} + + @staticmethod + def get_aggregate_decoy_state(decoy_state): + """ + Converts a decoy state into an aggregate decoy state + """ + ds = [] + decoy_hosts = [1, 2, 3, 9, 10, 11, 12] + for dh in decoy_hosts: + ds.append(decoy_state[dh]) + return ds + + @staticmethod + def get_aggregate_decoy_state_space(): + """ + Gets the aggregate decoy state space + """ + values = [range(5), range(2), range(2), range(5), range(5), range(3), range(5)] + return list(product(*values)) + + @staticmethod + def red_target_to_aggregate_target(target): + """ + Converts a red target to the corresponding aggregate target + """ + red_target_to_target_idx = {0: -1, 1: 0, 2: 1, 3: 2, 4: 0, 5: 0, 6: 0, 7: 7, 8: -1, 9: 3, 10: 4, 11: 5, 12: 6} + return red_target_to_target_idx[target] + + @staticmethod + def aggregate_control_target_to_original_target(target): + """ + Converts a red target to the corresponding aggregate target + """ + 
aggregate_target_to_red_target = {0: 1, 1: 2, 2: 3, 3: 9, 4: 10, 5: 11, 6: 7} + return aggregate_target_to_red_target[target] + + @staticmethod + def X(): + """ + Aggregate state space + """ + decoy_states = Cage2AggregateMDP.get_aggregate_decoy_state_space() + state_to_id = {} + id_to_state = {} + X = [] + state_id = 0 + targets = [-1] + list(range(8)) + target_1s = [-1, 3, 4, 5, 6] + target_2s = [-1, 0, 1] + for red_state in range(15): + for target_1 in target_1s: + for target_2 in target_2s: + if target_1 in [3, 4] and target_2 != 1: + continue + if target_1 in [5, 6] and target_2 != 0: + continue + if target_2 != -1 and target_1 == -1: + continue + for target in targets: + if target == -1 and (target_1 != -1 or target_2 != -1): + continue + if target != -1 and (target_1 == -1 or target_2 == -1): + continue + if red_state in [1, 2] and target not in [3, 4, 5, 6]: + continue + if red_state in [3, 4, 5] and target not in [0, 1]: + continue + if red_state in [6, 7, 8, 9] and target != 2: + continue + if red_state in [10, 11, 12, 13] and target != 7: + continue + if target in [3, 4, 5, 6] and target_1 != target: + continue + if target in [0, 1] and target_2 != target: + continue + if target in [3, 4, 5, 6] and red_state not in [0, 1, 2]: + continue + if target in [0, 1] and red_state not in [3, 4, 5]: + continue + if target == 2 and red_state not in [6, 7, 8, 9]: + continue + if target == 7 and red_state not in [10, 11, 12, 13, 14]: + continue + if red_state == 0 and (target != -1 or target_1 != -1 or target_2 != -1): + continue + if red_state >= 1 and (target_1 == -1 or target == -1 or target_2 == -1): + continue + for decoy_state in decoy_states: + state_to_id[f"{red_state},{target},{target_1},{target_2}," \ + f"{','.join(list(map(lambda x: str(x), list(decoy_state))))}"] = state_id + id_to_state[state_id] = [red_state, target, target_1, target_2] + list(decoy_state) + X.append(state_id) + state_id += 1 + return X, state_to_id, id_to_state + + @staticmethod + def U(): + """ + Aggregate control space + """ + return [0, 1, 2, 3, 4, 5, 6] + + @staticmethod + def cost_function(x, id_to_state): + """ + Aggregate cost function + """ + red_state = id_to_state[x][0] + if red_state in [3, 4, 5]: + return 0.1 + elif red_state in [6, 7, 8, 9]: + return 1 + elif red_state in [10, 11, 12]: + return 2 + elif red_state in [13, 14]: + return 3 + return 0 + + @staticmethod + def vi(X, U, gamma, epsilon, verbose, state_to_id, id_to_state): + """ + Value iteration + """ + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, + decoy_optimization=False) + J = np.zeros(len(X)) + iteration = 0 + while True: + delta = 0 + for x in X: + if x % 100000 == 0: + print(f"{x}/{len(X)}") + u_star, J_u_star = Cage2AggregateMDP.TJx(x=x, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state) + delta = max(delta, np.abs(J_u_star - J[x])) + J[x] = J_u_star + iteration += 1 + if verbose: + print(f"VI iteration: {iteration}, delta: {delta}, epsilon: {epsilon}") + ssx = [state_to_id[f"0,-1,-1,-1,0,0,0,0,0,0,0"], state_to_id[f"1,3,3,1,0,0,0,0,0,0,0"], + state_to_id[f"2,3,3,1,0,0,0,0,0,0,0"], + state_to_id[f"3,1,3,1,0,0,0,0,0,0,0"], state_to_id[f"4,1,3,1,0,0,0,0,0,0,0"], + state_to_id[f"5,1,3,1,0,0,0,0,0,0,0"], state_to_id[f"6,2,3,1,0,0,0,0,0,0,0"], + state_to_id[f"7,2,3,1,0,0,0,0,0,0,0"], state_to_id[f"8,2,3,1,0,0,0,0,0,0,0"], + state_to_id[f"9,2,3,1,0,0,0,0,0,0,0"], state_to_id[f"10,7,3,1,0,0,0,0,0,0,0"], + 
state_to_id[f"11,7,3,1,0,0,0,0,0,0,0"], state_to_id[f"12,7,3,1,0,0,0,0,0,0,0"], + state_to_id[f"13,7,3,1,0,0,0,0,0,0,0"], state_to_id[f"14,7,3,1,0,0,0,0,0,0,0"]] + for sx in ssx: + u = Cage2AggregateMDP.TJx(x=sx, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state)[0] + action = Cage2AggregateMDP.aggregate_control_to_original_control()[u] + sts = id_to_state[sx] + print(f"mu({sts})={action_id_to_type_and_host[action][1]}") + if delta < epsilon: + break + mu = Cage2AggregateMDP.policy(X=X, U=U, gamma=gamma, J=J, state_to_id=state_to_id, id_to_state=id_to_state) + return mu, J + + @staticmethod + def TJx(x, J, U, gamma, state_to_id, id_to_state): + """ + Implements the Bellman operator (TJ))(x) + """ + Q_x = np.zeros(len(U)) + for u in U: + feasible_x_prime = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + for x_prime in feasible_x_prime: + p = Cage2AggregateMDP.transition_probability(state_to_id=state_to_id, id_to_state=id_to_state, x=x, + x_prime=x_prime, u=u) + c = Cage2AggregateMDP.cost_function(x=x, id_to_state=id_to_state) + Q_x[u] += p * (c + gamma * J[x_prime]) + u_star = int(np.argmin(Q_x)) + return u_star, Q_x[u_star] + + @staticmethod + def policy(X, U, gamma, J, state_to_id, id_to_state): + """ + Constructs a policy based on J + """ + mu = np.zeros((len(X), len(U))) + for x in X: + mu[x][Cage2AggregateMDP.TJx(x=x, J=J, U=U, gamma=gamma, state_to_id=state_to_id, + id_to_state=id_to_state)[0]] = 1.0 + return mu + + @staticmethod + def run_vi(): + """ + Runs value iteration and saves the results to disk + """ + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + U = Cage2AggregateMDP.U() + gamma = 0.99 + epsilon = 0.1 + mu, J = Cage2AggregateMDP.vi(X=X, U=U, gamma=gamma, epsilon=epsilon, verbose=True, state_to_id=state_to_id, + id_to_state=id_to_state) + np.savetxt("mu1.txt", mu) + np.savetxt("J1.txt", J) + + @staticmethod + def test(): + """ + Simulates N test trajectories of the aggregate MDP + """ + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + U = Cage2AggregateMDP.U() + N = 1000 + for k in range(N): + x = state_to_id[f"0,-1,-1,0,0,0,0,0,0,0"] + while id_to_state[x][0] != 14: + u = random.choice(U) + feasible_states = Cage2AggregateMDP.feasible_next_states(state_id=x, state_to_id=state_to_id, + id_to_state=id_to_state, u=u) + probs = list(map(lambda x_prime: Cage2AggregateMDP.transition_probability( + state_to_id=state_to_id, id_to_state=id_to_state, x=x, x_prime=x_prime, u=u), feasible_states)) + print(f"u: {u}, x: {id_to_state[x]}") + x = np.random.choice(feasible_states, p=probs) + + +if __name__ == '__main__': + pass diff --git a/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py b/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py new file mode 100644 index 000000000..3697b3fd2 --- /dev/null +++ b/examples/eval/cyborg_scenario_two/eval_aggregate_mdp.py @@ -0,0 +1,149 @@ +import numpy as np +import copy +from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper +from gym_csle_cyborg.dao.red_agent_type import RedAgentType +from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.cyborg_wrapper_state import CyborgWrapperState +from cyborg_agg_mdp import Cage2AggregateMDP + + +# def monte_carlo_most_frequent(elements, num_samples): +# if not elements: +# raise ValueError("The input list is empty.") +# +# # Perform random sampling +# 
samples = [random.choice(elements) for _ in range(num_samples)] +# +# # Count occurrences of sampled elements +# counter = Counter(samples) +# +# # Find the most common element +# most_frequent_element = counter.most_common(1)[0][0] +# return most_frequent_element + +# def particle_filter(particles, max_num_particles, train_env, action, obs): +# new_particles = [] +# while len(particles) < max_num_particles: +# x = random.choice(particles) +# train_env.set_state(state=x) +# _, r, _, _, info = train_env.step(action) +# s_prime = info["s"] +# o = info["o"] +# if o == obs: +# new_particles.append(s_prime) +# return new_particles + +def restore_policy(s: CyborgWrapperState): + a = -1 + if s.s[1][2] == 2: + a = 0 # Ent0 + if s.s[2][2] == 2: + a = 1 # Ent 1 + if s.s[3][2] == 2: + a = 2 # Ent 2 + if s.s[7][2] == 2: + a = 3 # Opserver + + if s.s[1][2] == 1: + a = 8 # Ent0 + if s.s[2][2] == 1: + a = 9 # Ent1 + if s.s[3][2] == 1: + a = 10 # Ent2 + if s.s[3][2] == 1: + a = 11 # Opserver + if s.s[9][2] == 1: + a = 22 # User1 + if s.s[10][2] == 1: + a = 23 # User2 + if s.s[11][2] == 1: + a = 24 # User3 + if s.s[12][2] == 1: + a = 25 # User4 + return a + +def rollout(s: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, gamma=0.99): + # U = [0, 1, 2, 3, 8, 9, 10, 11, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 35] + U = [27, 28, 29, 30, 31, 32, 35] + U = [27, 28, 29, 30, 31, 32] + Q_n = [] + for u in U: + u_r = restore_policy(s=s) + if u_r != -1: + o, c, done, _, info = train_env.step(action=u_r) + s_prime = info["s"] + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id) + if l == 1: + return u_r, J[aggregate_state] + else: + returns = [] + for i in range(2): + returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=l-1)[1]) + cost_to_go = np.mean(returns) + else: + train_env.set_state(s) + o, c, done, _, info = train_env.step(action=u) + s_prime = info["s"] + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id) + if l == 1: + cost_to_go = J[aggregate_state] + else: + returns = [] + for i in range(2): + returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=l-1)[1]) + cost_to_go = np.mean(returns) + Q_n.append(-c + gamma*cost_to_go) + # print(Q_n) + # print(U[int(np.argmin(Q_n))]) + u_star = int(np.argmin(Q_n)) + return U[u_star], Q_n[u_star] + + +if __name__ == '__main__': + config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="", + save_trace=False, reward_shaping=False, scenario=2, + red_agent_type=RedAgentType.B_LINE_AGENT) + env = CyborgScenarioTwoWrapper(config=config) + train_env = CyborgScenarioTwoWrapper(config=config) + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, decoy_optimization=False) + N = 10000 + max_env_steps = 100 + mu = np.loadtxt("./mu1.txt") + J = np.loadtxt("./J1.txt") + X, state_to_id, id_to_state = Cage2AggregateMDP.X() + gamma = 0.99 + l = 3 + returns = [] + for i in range(N): + print(f"{i}/{N}") + done = False + _, info = env.reset() + s = info["s"] + t = 1 + R = 0 + particles = env.initial_particles + while not done and t < max_env_steps: + # monte_carlo_state = monte_carlo_most_frequent(elements=particles, num_samples=100) + aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s, state_to_id=state_to_id) + a = -1 + a = restore_policy(s=s) + + if t <= 1: 
+ a = 31 + if a == -1: + a = Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, + id_to_state=id_to_state) + # print(f"base: {a}") + a = rollout(s=s, state_to_id=state_to_id, train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0] + # print(f"rollout: {a}") + o, r, done, _, info = env.step(a) + # particles = particle_filter(particles=particles, max_num_particles=1000, + # train_env=train_env, action=a, obs=o) + s = info["s"] + t+= 1 + R+= r + # print(f"t:{t}, r: {r}, a: {action_id_to_type_and_host[a]}, R: {R}, aggstate: {id_to_state[aggregate_state]}") + returns.append(R) + print(np.mean(returns)) \ No newline at end of file diff --git a/examples/eval/cyborg_scenario_two/eval_on_base_env.py b/examples/eval/cyborg_scenario_two/eval_on_base_env.py index 754683087..0cd7a1d53 100644 --- a/examples/eval/cyborg_scenario_two/eval_on_base_env.py +++ b/examples/eval/cyborg_scenario_two/eval_on_base_env.py @@ -20,17 +20,12 @@ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - # print(csle_cyborg_env.action_id_to_type_and_host) - # import sys - # sys.exit(0) - # print("Starting policy evaluation") for i in range(num_evaluations): o, _ = csle_cyborg_env.reset() R = 0 t = 0 while t < max_horizon: - # a = ppo_policy.action(o=o) - a = 4 + a = ppo_policy.action(o=o) o, r, done, _, info = csle_cyborg_env.step(a) table = csle_cyborg_env.get_true_table() print(table) @@ -38,4 +33,4 @@ R += r t += 1 returns.append(R) - # print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") + print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") diff --git a/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py b/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py index da39cea81..3bfc21d4b 100644 --- a/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py +++ b/examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py @@ -19,7 +19,6 @@ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - print("Starting policy evaluation") for i in range(num_evaluations): o, _ = env.reset() R = 0 diff --git a/examples/eval/cyborg_scenario_two/pomcp_eval.py b/examples/eval/cyborg_scenario_two/pomcp_eval.py index e0e60c711..109ed3c9d 100644 --- a/examples/eval/cyborg_scenario_two/pomcp_eval.py +++ b/examples/eval/cyborg_scenario_two/pomcp_eval.py @@ -1,94 +1,112 @@ +from typing import List import numpy as np -import torch -import random -import json -import io -from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig -from gym_csle_cyborg.dao.red_agent_type import RedAgentType -from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender +import time from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig from csle_agents.agents.pomcp.pomcp import POMCP from csle_agents.agents.pomcp.pomcp_acquisition_function_type import POMCPAcquisitionFunctionType import csle_agents.constants.constants as agents_constants from csle_common.logging.log import Logger +from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil +from gym_csle_cyborg.dao.red_agent_type import RedAgentType + + +def heuristic_value(o: List[List[int]]) -> float: + """ + A heuristic value function + + :param o: the observation vector + :return: the value + """ + host_costs = CyborgEnvUtil.get_host_compromised_costs() + val = 0 + for i in range(len(o)): + if o[i][2] > 0: + val += host_costs[i] + return val + if __name__ == 
'__main__': - # ppo_policy = PPOPolicy(model=None, simulation_name="", save_path="") - config = CSLECyborgConfig( - gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT], - maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, decoy_state=True, - scanned_state=True, decoy_optimization=False, cache_visited_states=False) - eval_env = CyborgScenarioTwoDefender(config=config) - config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="", - save_trace=False, reward_shaping=False, scenario=2) + config = CSLECyborgWrapperConfig( + gym_env_name="csle-cyborg-scenario-two-wrapper-v1", maximum_steps=100, save_trace=False, scenario=2, + reward_shaping=True, red_agent_type=RedAgentType.B_LINE_AGENT) + eval_env = CyborgScenarioTwoWrapper(config=config) train_env = CyborgScenarioTwoWrapper(config=config) + action_id_to_type_and_host, type_and_host_to_action_id \ + = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True, + decoy_optimization=False) - num_evaluations = 10 - max_horizon = 100 - returns = [] - seed = 215125 - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) + N = 5000 + rollout_policy = lambda x, deterministic: 35 + value_function = heuristic_value A = train_env.get_action_space() - gamma = 0.75 - c = 1 - print("Starting policy evaluation") - for i in range(num_evaluations): + gamma = 0.99 + reinvigoration = False + reinvigorated_particles_ratio = 0.0 + initial_particles = train_env.initial_particles + planning_time = 3.75 + prune_action_space = False + max_particles = 1000 + max_planning_depth = 50 + max_rollout_depth = 4 + c = 0.5 + c2 = 15000 + use_rollout_policy = False + prior_weight = 5 + prior_confidence = 0 + acquisition_function_type = POMCPAcquisitionFunctionType.UCB + log_steps_frequency = 1 + max_negative_samples = 20 + default_node_value = 0 + verbose = False + eval_batch_size = 100 + max_env_steps = 100 + prune_size = 3 + start = time.time() + + # Run N episodes + returns = [] + for i in range(N): + done = False + action_sequence = [] _, info = eval_env.reset() s = info[agents_constants.COMMON.STATE] train_env.reset() - initial_particles = train_env.initial_particles - max_particles = 1000 - planning_time = 60 - value_function = lambda x: 0 - reinvigoration = False - rollout_policy = False - verbose = False - default_node_value = 0 - prior_weight = 1 - acquisition_function_type = POMCPAcquisitionFunctionType.UCB - use_rollout_policy = False - reinvigorated_particles_ratio = False - prune_action_space = False - prune_size = 3 - prior_confidence = 0 pomcp = POMCP(A=A, gamma=gamma, env=train_env, c=c, initial_particles=initial_particles, planning_time=planning_time, max_particles=max_particles, rollout_policy=rollout_policy, value_function=value_function, reinvigoration=reinvigoration, verbose=verbose, default_node_value=default_node_value, prior_weight=prior_weight, - acquisition_function_type=acquisition_function_type, c2=1500, + acquisition_function_type=acquisition_function_type, c2=c2, use_rollout_policy=use_rollout_policy, prior_confidence=prior_confidence, reinvigorated_particles_ratio=reinvigorated_particles_ratio, prune_action_space=prune_action_space, prune_size=prune_size) - rollout_depth = 4 - planning_depth = 100 R = 0 - t = 0 - action_sequence = [] - while t < max_horizon: - pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth) + t = 1 + + # Run episode + while not done and t <= max_env_steps: + rollout_depth = 
max_rollout_depth + planning_depth = max_planning_depth + pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth, t=t) action = pomcp.get_action() - o, r, done, _, info = eval_env.step(action) + o, _, done, _, info = eval_env.step(action) + r = info[agents_constants.COMMON.REWARD] action_sequence.append(action) s_prime = info[agents_constants.COMMON.STATE] obs_id = info[agents_constants.COMMON.OBSERVATION] - pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id) - print(eval_env.get_true_table()) - print(eval_env.get_table()) + pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id, t=t) R += r t += 1 - Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action}, r: {r}, o: {obs_id}, " - f"s_prime: {s_prime}," - f", action sequence: {action_sequence}, R: {R}") + if t % log_steps_frequency == 0: + Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action_id_to_type_and_host[action]}, r: {r}, " + f"action sequence: {action_sequence}, R: {round(R, 2)}") + + # Logging returns.append(R) - print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}") - results = {} - results["seed"] = seed - results["training_time"] = 0 - results["returns"] = returns - results["planning_time"] = planning_time - json_str = json.dumps(results, indent=4, sort_keys=True) - with io.open(f"/Users/kim/pomcp_{0}_60s.json", 'w', encoding='utf-8') as f: - f.write(json_str) + progress = round((i + 1) / N, 2) + time_elapsed_minutes = round((time.time() - start) / 60, 3) + Logger.__call__().get_logger().info( + f"[POMCP] episode: {i}, J:{R}, " + f"J_avg: {np.mean(returns)}, " + f"progress: {round(progress * 100, 2)}%, " + f"runtime: {time_elapsed_minutes} min") diff --git a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py index 45589c5b3..d604f6917 100644 --- a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py +++ b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py @@ -57,6 +57,7 @@ class COMMON: EVALUATE_WITH_DISCOUNT = "evaluate_with_discount" STATE = "s" OBSERVATION = "o" + REWARD = "r" class PPO: diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py index 5f6c76b35..037354697 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py @@ -31,6 +31,7 @@ class ENV_METRICS: ATTACKER_ACTION = "a2" OBSERVATION = "o" OBSERVATION_VECTOR = "obs_vec" + REWARD = "r" OBSERVATION_ID = "obs_id" TIME_STEP = "t" AVERAGE_UPPER_BOUND_RETURN = "average_upper_bound_return" diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py index 3f412f2f4..d73a65f79 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/dao/cyborg_wrapper_state.py @@ -1,5 +1,6 @@ from typing import List, Dict, Any, Union from csle_base.json_serializable import JSONSerializable +import gym_csle_cyborg.constants.constants as env_constants class CyborgWrapperState(JSONSerializable): @@ -75,7 +76,7 @@ def from_dict(d: Dict[str, Any]) -> 
"CyborgWrapperState": obj = CyborgWrapperState( s=d["s"], scan_state=d["scan_state"], op_server_restored=d["op_server_restored"], obs=d["obs"], red_action_targets=d["red_action_targets"], - privilege_escalation_detected=d["privilege_escalation_deteceted"], red_agent_state=d["red_agent_state"], + privilege_escalation_detected=d["privilege_escalation_detected"], red_agent_state=d["red_agent_state"], red_agent_target=d["red_agent_target"], attacker_observed_decoy=d["attacker_observed_decoy"], detected=d["detected"], malware_state=d["malware_state"], ssh_access=d["ssh_access"], escalated=d["escalated"], exploited=d["exploited"], bline_base_jump=d["bline_base_jump"], @@ -133,3 +134,11 @@ def from_json_file(json_file_path: str) -> "CyborgWrapperState": json_str = f.read() dto = CyborgWrapperState.from_json_str(json_str=json_str) return dto + + def get_decoy_state(self): + """ + Extracts the decoy state + + :return: a list with the decoy state of each host + """ + return [host_state[env_constants.CYBORG.HOST_STATE_DECOY_IDX] for host_state in self.s] \ No newline at end of file diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py index da7f256ff..674598ff8 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_wrapper.py @@ -410,6 +410,7 @@ def step(self, action: int) -> Tuple[npt.NDArray[Any], float, bool, bool, Dict[s info[env_constants.ENV_METRICS.OBSERVATION] = CyborgEnvUtil.state_vector_to_state_id( state_vector=obs, observation=True) info[env_constants.ENV_METRICS.OBSERVATION_VECTOR] = obs + info[env_constants.ENV_METRICS.REWARD] = r self.scan_state = copy.deepcopy(scan_state) self.s = s_prime self.last_obs = copy.deepcopy(obs)