Commit

aggregate cage-2 mdp
Limmen committed Jan 11, 2025
1 parent 325f4d9 commit 68531fb
Showing 7 changed files with 296 additions and 868 deletions.
359 changes: 191 additions & 168 deletions examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py

Large diffs are not rendered by default.

136 changes: 86 additions & 50 deletions examples/eval/cyborg_scenario_two/eval_aggregate_mdp_on_wrapper.py
@@ -1,7 +1,6 @@
from typing import List, Dict, Tuple
import random
import numpy as np
import copy
import math
from collections import Counter
from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
@@ -12,24 +11,36 @@
import csle_agents.constants.constants as agents_constants


def monte_carlo_most_frequent_particle(particles, N):
def monte_carlo_most_frequent_particle(particles: List[CyborgWrapperState], N: int) -> CyborgWrapperState:
"""
Samples N particles and returns the most frequently sampled particle
:param particles: the list of particles
:param N: the number of samples
:return: the most frequently sampled particle
"""
samples = [random.choice(particles) for _ in range(N)]
counter = Counter(samples)
most_frequent_particle = counter.most_common(1)[0][0]
return most_frequent_particle
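
A minimal usage sketch (not part of the commit): the toy string particles below stand in for the CyborgWrapperState objects used in the evaluation loop further down; the helper simply returns the mode of N random draws.

# Illustrative only: estimate the most likely particle from a toy belief.
toy_particles = ["state_a", "state_a", "state_b"]
print(monte_carlo_most_frequent_particle(particles=toy_particles, N=1000))  # most likely "state_a"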


def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
def particle_filter(particles: List[CyborgWrapperState], max_num_particles: int, train_env: CyborgScenarioTwoWrapper,
u: int, obs: int, x_s: CyborgWrapperState) -> List[CyborgWrapperState]:
"""
Implements a particle filter
:param particles: the list of particles
:param max_num_particles: the maximum number of particles
:param train_env: the environment used for sampling
:param u: the latest control
:param obs: the latest observation
:param x_s: the true cyborg state
:return: the list of updated particles
"""
new_particles = []
failed_samples = 0
while len(new_particles) < max_num_particles:
# print(f"{len(new_particles)}/{max_num_particles}")
x = random.choice(particles)
train_env.set_state(state=x)
_, _, _, _, info = train_env.step(u)
@@ -46,74 +57,96 @@ def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
return new_particles
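
A usage sketch (not part of the commit), mirroring how the filter is wired into the evaluation loop below: after executing control u in the evaluation environment env, the belief particles are propagated one step with the simulation environment train_env.

# Illustrative only: one belief update after taking control u in env.
_, _, _, _, info = env.step(u)
particles = particle_filter(particles=particles, max_num_particles=50, train_env=train_env,
                            u=u, obs=info[agents_constants.COMMON.OBSERVATION],
                            x_s=info[agents_constants.COMMON.STATE])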


def restore_policy(x: CyborgWrapperState):
def restore_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, particles: List[CyborgWrapperState]) \
-> int:
"""
Implements a heuristic restore policy for Cage2
:param x: the certainty-equivalence state
:param train_env: the environment used for simulation
:param particles: the current list of particles
:return: the control
"""
u = -1
if x.s[1][2] == 2:
u = 0 # Ent0
if x.s[2][2] == 2:
u = 1 # Ent 1
if x.s[3][2] == 2:
u = 2 # Ent 2
if x.s[7][2] == 2:
u = 3 # Opserver

if x.s[1][2] == 1:
u = 8 # Ent0
if x.s[2][2] == 1:
u = 9 # Ent1
if x.s[3][2] == 1:
u = 10 # Ent2
if x.s[3][2] == 1:
u = 11 # Opserver
if x.s[9][2] == 1:
u = 22 # User1
if x.s[10][2] == 1:
u = 23 # User2
if x.s[11][2] == 1:
u = 24 # User3
if x.s[12][2] == 1:
u = 25 # User4
restore_actions = [0, 1, 2, 3]
remove_actions = [8, 9, 10, 11, 22, 23, 24, 25]
remove_hosts = [1, 2, 3, 7, 9, 10, 11, 12]
restore_hosts = [1, 2, 3, 7]
outcomes = {}
for h in remove_hosts:
outcomes[h] = []
for i, host in enumerate(remove_hosts):
for p in particles:
if p.s[host][2] == 1:
train_env.set_state(p)
train_env.step(action=remove_actions[i])
if train_env.s[host][2] == 0:
outcomes[host].append(1)
else:
outcomes[host].append(0)
for i, h in enumerate(remove_hosts):
if len(outcomes[h]) > 0:
remove_p = np.mean(outcomes[h])
if remove_p >= 0.9:
return remove_actions[i]
for i, host in enumerate(restore_hosts):
if x.s[host][2] > 0:
return restore_actions[i]
return u
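
A usage sketch (not part of the commit): the heuristic is queried with the most frequent particle as a certainty-equivalence state and returns -1 when no remove or restore action is warranted, in which case the evaluation loop below falls back to the rollout policy.

# Illustrative only: try the remove/restore heuristic before the rollout policy.
x_hat = monte_carlo_most_frequent_particle(particles=particles, N=100)
u = restore_policy(x=x_hat, train_env=train_env, particles=particles)
if u == -1:
    pass  # no host flagged for remove/restore; defer to rollout_policy below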


def rollout_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, id_to_state,
gamma=0.99, mc_samples=10):
def rollout_policy(train_env: CyborgScenarioTwoWrapper, J: List[float], state_to_id: Dict[str, int],
mu: List[List[float]], l: int, id_to_state: Dict[int, List[int]],
particles: List[CyborgWrapperState], gamma=0.99, mc_samples=10) -> Tuple[int, float]:
"""
A rollout policy for cage-2
:param train_env: the environment to use for sampling
:param J: the cost-to-go function of the base policy
:param state_to_id: the aggregate state to aggregate state id map
:param mu: the base policy
:param l: the lookahead horizon
:param id_to_state: the aggregate state id to aggregate state map
:param particles: the current list of particles
:param gamma: the discount factor
:param mc_samples: the number of Monte-Carlo samples to use
:return: the next control and its estimated value
"""
U = [27, 28, 29, 30, 31, 32, 35]
Q_n = []
u_r = restore_policy(x=x)
for u in U:
returns = []
for i in range(mc_samples):
train_env.set_state(x)
particle = random.choice(particles)
train_env.set_state(particle)
_, _, _, _, info = train_env.step(action=u)
x_prime = info[agents_constants.COMMON.STATE]
aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x_prime, state_to_id=state_to_id)
# c = -info[agents_constants.COMMON.REWARD]
c = Cage2AggregateMDP.cost_function(x=aggregate_state, u=U.index(u), id_to_state=id_to_state)
c = -info[agents_constants.COMMON.REWARD]
if l == 1:
returns.append(c + gamma * J[aggregate_state])
else:
returns.append(c + gamma * rollout_policy(copy.deepcopy(x_prime), train_env=train_env, J=J,
returns.append(c + gamma * rollout_policy(train_env=train_env, J=J,
state_to_id=state_to_id, id_to_state=id_to_state,
mu=mu, l=l - 1)[1])
Q_n.append(np.mean(returns))
u_star = int(np.argmin(Q_n))
J_star = Q_n[u_star]
J_star = float(Q_n[u_star])
u_star = U[u_star]
u_r = restore_policy(x=x, train_env=train_env, particles=particles)
if u_r != -1:
u_star = u_r
return u_star, J_star
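
A usage sketch (not part of the commit), matching the call in the evaluation loop below; mu, J, state_to_id and id_to_state are assumed to be loaded as in the __main__ block, and l=1 is used so only the base policy's cost-to-go J is consulted.

# Illustrative only: one-step-lookahead rollout over the aggregate MDP.
u, J_hat = rollout_policy(train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=1,
                          id_to_state=id_to_state, particles=particles, gamma=0.99, mc_samples=20)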


def base_policy(x, mu, id_to_state):
def base_policy(x: CyborgWrapperState, mu: List[List[float]], id_to_state: Dict[int, List[int]]) -> int:
"""
Implements the base policy mu
:param x: the current state id
:param mu: the base policy
:param id_to_state: the aggregate state id to aggregate state map
:return: the next control
"""
aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)
return Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, id_to_state=id_to_state)
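
A usage sketch (not part of the commit); note that base_policy reads the module-level state_to_id created in the __main__ block rather than taking it as a parameter.

# Illustrative only: greedy control of the base policy mu for the aggregate state of x.
u = base_policy(x=x, mu=mu, id_to_state=id_to_state)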
@@ -130,8 +163,8 @@ def base_policy(x, mu, id_to_state):
decoy_optimization=False)
N = 10000
max_env_steps = 100
mu = np.loadtxt("./mu.txt")
J = np.loadtxt("./J.txt")
mu = np.loadtxt("test/mu2.txt")
J = np.loadtxt("test/J2.txt")
X, state_to_id, id_to_state = Cage2AggregateMDP.X()
gamma = 0.99
l = 1
@@ -145,21 +178,24 @@ def base_policy(x, mu, id_to_state):
particles = env.initial_particles
while not done and t < max_env_steps:
monte_carlo_state = monte_carlo_most_frequent_particle(particles=particles, N=100)
# u = restore_policy(x=x)
u = restore_policy(x=monte_carlo_state)
u = restore_policy(x=monte_carlo_state, train_env=train_env, particles=particles)
if t <= 2:
u = 31
if u == -1:
# u = base_policy(x=monte_carlo_state, mu=mu, id_to_state=id_to_state)
# u = base_policy(x=x, mu=mu, id_to_state=id_to_state)
u = rollout_policy(x=x, state_to_id=state_to_id, id_to_state=id_to_state,
train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0]
u = rollout_policy(state_to_id=state_to_id, id_to_state=id_to_state, train_env=train_env, J=J, mu=mu,
gamma=gamma, l=l, particles=particles, mc_samples=20)[0]
_, _, _, _, info = env.step(u)
particles = particle_filter(particles=particles, max_num_particles=50,
train_env=train_env, u=u, obs=info[agents_constants.COMMON.OBSERVATION],
x_s=info[agents_constants.COMMON.STATE])
c = -info[agents_constants.COMMON.REWARD]
C += math.pow(gamma, t - 1) * c
print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
f"aggstate: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
C += c
# aggstate = id_to_state[Cage2AggregateMDP.get_aggregate_state(s=monte_carlo_state,
# state_to_id=state_to_id)]
# print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
# f"aggstate: {aggstate},"
# f"true state: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
x = info[agents_constants.COMMON.STATE]
t += 1
returns.append(C)
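
A sketch (not part of the commit) of one way the per-episode costs collected in returns could be summarized after the loop; np is numpy as imported at the top of the file.

# Illustrative only: average cumulative cost over the evaluated episodes.
print(f"average cost over {len(returns)} episodes: {float(np.mean(returns))}")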
3 changes: 0 additions & 3 deletions examples/eval/cyborg_scenario_two/test/.gitignore

This file was deleted.
