POMCP [WIP]
Limmen committed Jan 18, 2024
1 parent 633dec5 commit 8e4c52f
Showing 19 changed files with 482 additions and 171 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -90,7 +90,7 @@ version of the documentation is available [here](./releases/)
| Release | Last date of support |
|---------------------------------------------------------------|----------------------|
| [v.0.4.0](https://github.com/Limmen/csle/releases/tag/v0.4.0) | 2024-02-07 |
| [v.0.3.0](https://github.com/Limmen/csle/releases/tag/v0.3.0) | 2024-01-17 |
| [v.0.3.0](https://github.com/Limmen/csle/releases/tag/v0.3.0) | ~~2024-01-17~~ |
| [v.0.2.0](https://github.com/Limmen/csle/releases/tag/v0.2.0) | ~~2023-10-30~~ |
| [v.0.1.0](https://github.com/Limmen/csle/releases/tag/v0.1.0) | ~~2023-06-06~~ |

63 changes: 43 additions & 20 deletions examples/manual_play/cyborg_restore_defender.py
@@ -1,28 +1,51 @@
from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender
from gym_csle_cyborg.dao.blue_agent_action_type import BlueAgentActionType
import gym_csle_cyborg.constants.constants as env_constants

if __name__ == '__main__':
config = CSLECyborgConfig(
gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT],
maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=False, decoy_state=False,
scanned_state=False, decoy_optimization=False)
maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, decoy_state=True,
scanned_state=True, decoy_optimization=False)
csle_cyborg_env = CyborgScenarioTwoDefender(config=config)
a = 1
R = 0
for t in range(1000):
o, r, done, _, info = csle_cyborg_env.step(a)
if done:
csle_cyborg_env.reset()
R += r
print(f"time-step: {t + 1}, cumulative reward: {R}, a: {a}")
for i in range(len(info["obs_per_host"])):
if csle_cyborg_env.cyborg_hostnames[i] == "User0":
continue
a = 1
if info["obs_per_host"][i]["compromised"].value > 0:
host = csle_cyborg_env.cyborg_hostnames[i]
action_type = BlueAgentActionType.RESTORE
a = csle_cyborg_env.cyborg_action_type_and_host_to_id[(action_type, host)]
break
o, info = csle_cyborg_env.reset()
initial_state_id = info[env_constants.ENV_METRICS.STATE]
csle_cyborg_env.step(1)
csle_cyborg_env.set_state(state=initial_state_id)
# print(csle_cyborg_env.cyborg_challenge_env.env.env.env.env.env.environment_controller.observation["Red"].data["User0"])
csle_cyborg_env.step(1)

# print("INITIAL2 STATE")
# print(csle_cyborg_env.get_true_table())
# # csle_cyborg_env.get_true_table()
# o, r, done, _, info = csle_cyborg_env.step(1)
# print("INITIAL1 STATE")
# print(csle_cyborg_env.get_true_table())
# initial_obs_id = info[env_constants.ENV_METRICS.OBSERVATION]
# initial_state_id = info[env_constants.ENV_METRICS.STATE]
# # csle_cyborg_env.set_state(state=initial_state_id)
# csle_cyborg_env.step(1)
# print("SECOND STATE")
# print(csle_cyborg_env.get_true_table())
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# print(csle_cyborg_env.get_true_table())
# print("SET STATE")
# csle_cyborg_env.set_state(state=initial_state_id)
# print(csle_cyborg_env.get_true_table())
# csle_cyborg_env.step(1)
# print(csle_cyborg_env.get_true_table())
# csle_cyborg_env.step(1)
# print(csle_cyborg_env.get_true_table())
# csle_cyborg_env.step(1)
# print(csle_cyborg_env.get_true_table())
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# csle_cyborg_env.step(1)
# print(csle_cyborg_env.get_true_table())
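For orientation, the rewritten script above mainly exercises the environment's `set_state` functionality (presumably in support of the particle-based simulation that POMCP needs). A minimal sketch of that save/step/restore round-trip, assuming the `gym_csle_cyborg` API shown in the diff:

```python
from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender
import gym_csle_cyborg.constants.constants as env_constants

# Configuration mirroring the diff: reduced action space, decoy and scanned state enabled
config = CSLECyborgConfig(
    gym_env_name="csle-cyborg-scenario-two-v1", scenario=2,
    baseline_red_agents=[RedAgentType.B_LINE_AGENT], maximum_steps=100,
    red_agent_distribution=[1.0], reduced_action_space=True,
    decoy_state=True, scanned_state=True, decoy_optimization=False)
env = CyborgScenarioTwoDefender(config=config)

o, info = env.reset()
saved_state_id = info[env_constants.ENV_METRICS.STATE]  # id of the sampled initial state
env.step(1)                          # advance the environment one step
env.set_state(state=saved_state_id)  # roll the environment back to the saved state
env.step(1)                          # step again, now from the restored state
```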
98 changes: 17 additions & 81 deletions examples/manual_play/cyborg_test.py
@@ -2,78 +2,7 @@
from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender


def info_to_vec(info, decoy_state, hosts):
"""
Creates the state vector
:param info: the info
:param decoy_state: the decoy state
:param hosts: the host list
:return: the state vector
"""
state_vec = []
for host in hosts:
known = info[host][3]
known = int(known)
scanned = info[host][4]
scanned = int(scanned)
access = info[host][5]
if access == "None":
access = 0
elif access == "User":
access = 1
else:
access = 2
d_state = len(decoy_state[host])
state_vec.append([known, scanned, access, d_state])
return state_vec


def state_vec_to_id(state_vec):
"""
Converts a state vector to an id
:param state_vec: the state vector to convert
:return: the id
"""
bin_id = ""
for host_vec in state_vec:
host_bin_str = ""
for i, elem in enumerate(host_vec):
if i == 0:
host_bin_str += format(elem, '01b')
if i == 1:
host_bin_str += format(elem, '01b')
if i == 2:
host_bin_str += format(elem, '02b')
if i == 3:
host_bin_str += format(elem, '03b')
bin_id += host_bin_str
id = int(bin_id, 2)
return id


def id_to_state_vec(id: int):
"""
Converts an id to a state vector
:param id: the id to convert
:return: the state vector
"""
bin_str = format(id, "091b")
host_bins = [bin_str[i:i + 7] for i in range(0, len(bin_str), 7)]
state_vec = []
for host_bin in host_bins:
known = int(host_bin[0:1], 2)
scanned = int(host_bin[1:2], 2)
access = int(host_bin[2:4], 2)
decoy = int(host_bin[4:7], 2)
host_vec = [known, scanned, access, decoy]
state_vec.append(host_vec)
return state_vec

from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil

if __name__ == '__main__':
config = CSLECyborgConfig(
@@ -84,23 +13,30 @@ def id_to_state_vec(id: int):
str_info = str(csle_cyborg_env.cyborg_challenge_env.env.env.env.info)
states = {}
state_idx = 0
host_state_lookup = host_state_to_id(hostnames=csle_cyborg_env.cyborg_hostnames)
host_ids = list(csle_cyborg_env.cyborg_hostname_to_id.values())

for i in range(100000):
done = False
csle_cyborg_env.reset()
actions = list(csle_cyborg_env.action_id_to_type_and_host.keys())
state_key = str(csle_cyborg_env.cyborg_challenge_env.env.env.env.info)
if state_key not in states:
states[state_key] = state_idx
state_id = str(csle_cyborg_env.cyborg_challenge_env.env.env.env.info)
if state_id not in states:
states[state_id] = state_idx
state_idx += 1

while not done:
a = random.choice(actions)
o, r, done, _, info = csle_cyborg_env.step(a)
state_vec = info_to_vec(csle_cyborg_env.get_true_table().rows, csle_cyborg_env.decoy_state,
host_state_lookup, host_ids)
state_key = state_vec_to_id(state_vec=state_vec)
stv = id_to_state_vec(id=state_key)
assert stv == state_vec
state_vector = CyborgEnvUtil.state_to_vector(state=csle_cyborg_env.get_true_table().rows,
decoy_state=csle_cyborg_env.decoy_state, host_ids=host_ids,
scan_state=csle_cyborg_env.scan_state)
state_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=state_vector)
converted_state_vector = CyborgEnvUtil.state_id_to_state_vector(state_id=state_id)
assert converted_state_vector == state_vector
obs_vector = CyborgEnvUtil.state_to_vector(state=csle_cyborg_env.get_table().rows,
decoy_state=csle_cyborg_env.decoy_state,
host_ids=host_ids, scan_state=csle_cyborg_env.scan_state,
observation=True)
obs_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=obs_vector, observation=True)
converted_obs_vector = CyborgEnvUtil.state_id_to_state_vector(state_id=obs_id, observation=True)
assert converted_obs_vector == obs_vector
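For reference, the removed helpers above encode each host's state `[known, scanned, access, decoy]` into 1 + 1 + 2 + 3 = 7 bits (13 hosts → a 91-bit id), which the new `CyborgEnvUtil.state_vector_to_state_id` / `state_id_to_state_vector` calls presumably generalize. A minimal sketch reproducing that per-host round-trip from the removed encoding:

```python
def encode_host(known: int, scanned: int, access: int, decoy: int) -> int:
    # 1 bit known, 1 bit scanned, 2 bits access (0=None, 1=User, 2=privileged), 3 bits decoy count
    bits = format(known, '01b') + format(scanned, '01b') + format(access, '02b') + format(decoy, '03b')
    return int(bits, 2)

def decode_host(host_id: int) -> list:
    bits = format(host_id, '07b')
    return [int(bits[0:1], 2), int(bits[1:2], 2), int(bits[2:4], 2), int(bits[4:7], 2)]

host_vec = [1, 0, 2, 3]           # known, not scanned, privileged access, 3 decoys deployed
host_id = encode_host(*host_vec)  # "1" + "0" + "10" + "011" = 0b1010011 = 83
assert decode_host(host_id) == host_vec
```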
20 changes: 20 additions & 0 deletions examples/training/pomcp/cyborg_scenario_two_defender/README.md
@@ -0,0 +1,20 @@
# POMCP for defender planning in CybORG

## Commands

To run a script, execute:
```bash
python <script_name>
```

## Author & Maintainer

Kim Hammar <kimham@kth.se>

## Copyright and license

[LICENSE](../../../../LICENSE.md)

Creative Commons

(C) 2020-2024, Kim Hammar
@@ -0,0 +1,89 @@
import csle_common.constants.constants as constants
from csle_common.dao.training.experiment_config import ExperimentConfig
from csle_common.metastore.metastore_facade import MetastoreFacade
from csle_common.dao.training.agent_type import AgentType
from csle_common.dao.training.hparam import HParam
from csle_common.dao.training.player_type import PlayerType
from csle_agents.agents.pomcp.pomcp_agent import POMCPAgent
import csle_agents.constants.constants as agents_constants
from csle_agents.common.objective_type import ObjectiveType
from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender

if __name__ == '__main__':
emulation_name = "csle-level9-040"
emulation_env_config = MetastoreFacade.get_emulation_by_name(emulation_name)
if emulation_env_config is None:
raise ValueError(f"Could not find an emulation environment with the name: {emulation_name}")
simulation_name = "csle-cyborg-001"
simulation_env_config = MetastoreFacade.get_simulation_by_name(simulation_name)
if simulation_env_config is None:
raise ValueError(f"Could not find a simulation with name: {simulation_name}")
simulation_env_config.simulation_env_input_config = CSLECyborgConfig(
gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT],
maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, scanned_state=True,
decoy_state=True, decoy_optimization=False)
csle_cyborg_env = CyborgScenarioTwoDefender(config=simulation_env_config.simulation_env_input_config)
A = csle_cyborg_env.get_action_space()
b1 = csle_cyborg_env.initial_belief
experiment_config = ExperimentConfig(
output_dir=f"{constants.LOGGING.DEFAULT_LOG_DIR}pomcp_test", title="POMCP test",
random_seeds=[399, 98912, 999, 555],
agent_type=AgentType.POMCP,
log_every=1,
hparams={
agents_constants.POMCP.N: HParam(value=50, name=agents_constants.POMCP.N,
descr="the number of episodes"),
agents_constants.POMCP.OBJECTIVE_TYPE: HParam(
value=ObjectiveType.MAX, name=agents_constants.POMCP.OBJECTIVE_TYPE,
descr="the type of objective (max or min)"),
agents_constants.POMCP.ROLLOUT_POLICY: HParam(
value=None, name=agents_constants.POMCP.ROLLOUT_POLICY,
descr="the policy to use for rollouts"),
agents_constants.POMCP.VALUE_FUNCTION: HParam(
value=lambda x: 0, name=agents_constants.POMCP.VALUE_FUNCTION,
descr="the value function to use for truncated rollouts"),
agents_constants.POMCP.A: HParam(value=A, name=agents_constants.POMCP.A, descr="the action space"),
agents_constants.POMCP.GAMMA: HParam(value=0.99, name=agents_constants.POMCP.GAMMA,
descr="the discount factor"),
agents_constants.POMCP.REINVIGORATION: HParam(value=False, name=agents_constants.POMCP.REINVIGORATION,
descr="whether reinvigoration should be used"),
agents_constants.POMCP.INITIAL_BELIEF: HParam(value=b1, name=agents_constants.POMCP.INITIAL_BELIEF,
descr="the initial belief"),
agents_constants.POMCP.PLANNING_TIME: HParam(value=300, name=agents_constants.POMCP.PLANNING_TIME,
descr="the planning time"),
agents_constants.POMCP.MAX_PARTICLES: HParam(value=1000, name=agents_constants.POMCP.MAX_PARTICLES,
descr="the maximum number of belief particles"),
agents_constants.POMCP.MAX_DEPTH: HParam(value=500, name=agents_constants.POMCP.MAX_DEPTH,
descr="the maximum depth for planning"),
agents_constants.POMCP.C: HParam(value=0.35, name=agents_constants.POMCP.C,
descr="the weighting factor for UCB exploration"),
agents_constants.POMCP.LOG_STEP_FREQUENCY: HParam(
value=1, name=agents_constants.POMCP.LOG_STEP_FREQUENCY, descr="frequency of logging time-steps"),
agents_constants.POMCP.DEFAULT_NODE_VALUE: HParam(
value=-2000, name=agents_constants.POMCP.DEFAULT_NODE_VALUE, descr="the default node value in "
"the search tree"),
agents_constants.POMCP.VERBOSE: HParam(value=True, name=agents_constants.POMCP.VERBOSE,
descr="verbose logging flag"),
agents_constants.COMMON.EVAL_BATCH_SIZE: HParam(value=100, name=agents_constants.COMMON.EVAL_BATCH_SIZE,
descr="number of evaluation episodes"),
agents_constants.COMMON.CONFIDENCE_INTERVAL: HParam(
value=0.95, name=agents_constants.COMMON.CONFIDENCE_INTERVAL,
descr="confidence interval"),
agents_constants.COMMON.MAX_ENV_STEPS: HParam(
value=500, name=agents_constants.COMMON.MAX_ENV_STEPS,
descr="maximum number of steps in the environment (for envs with infinite horizon generally)"),
agents_constants.COMMON.RUNNING_AVERAGE: HParam(
value=100, name=agents_constants.COMMON.RUNNING_AVERAGE,
descr="the number of samples to include when computing the running avg"),
agents_constants.COMMON.GAMMA: HParam(
value=0.99, name=agents_constants.COMMON.GAMMA,
descr="the discount factor")
},
player_type=PlayerType.DEFENDER, player_idx=0
)
agent = POMCPAgent(emulation_env_config=emulation_env_config, simulation_env_config=simulation_env_config,
experiment_config=experiment_config, save_to_metastore=False)
experiment_execution = agent.train()
MetastoreFacade.save_experiment_execution(experiment_execution)
2 changes: 1 addition & 1 deletion examples/training/pomcp/stopping_pomdp_defender/README.md
@@ -1,4 +1,4 @@
# Random Search for POMDP
# POMCP for defender planning in stopping POMDP

This directory contains example scripts for optimizing defender policies using Partially Observable Monte-Carlo Planning (POMCP) for the POMDP from [https://ieeexplore.ieee.org/document/9779345](https://ieeexplore.ieee.org/document/9779345)

@@ -53,6 +53,9 @@
A = simulation_env_config.simulation_env_input_config.stopping_game_config.A1
O = simulation_env_config.simulation_env_input_config.stopping_game_config.O
b1 = simulation_env_config.simulation_env_input_config.stopping_game_config.b1
initial_belief = {}
for i in range(len(b1)):
initial_belief[i] = b1[i]
rollout_policy = MultiThresholdStoppingPolicy(
theta=[0.75], simulation_name=simulation_name, L=stopping_game_config.L,
states=simulation_env_config.state_space_config.states, player_type=PlayerType.DEFENDER,
@@ -75,13 +78,14 @@
agents_constants.POMCP.VALUE_FUNCTION: HParam(
value=lambda x: 0, name=agents_constants.POMCP.VALUE_FUNCTION,
descr="the value function to use for truncated rollouts"),
agents_constants.POMCP.S: HParam(value=S, name=agents_constants.POMCP.S, descr="the state space"),
agents_constants.POMCP.O: HParam(value=O, name=agents_constants.POMCP.O, descr="the observation space"),
agents_constants.POMCP.A: HParam(value=A, name=agents_constants.POMCP.A, descr="the action space"),
agents_constants.POMCP.GAMMA: HParam(value=0.99, name=agents_constants.POMCP.GAMMA,
descr="the discount factor"),
agents_constants.POMCP.INITIAL_BELIEF: HParam(value=b1, name=agents_constants.POMCP.INITIAL_BELIEF,
agents_constants.POMCP.INITIAL_BELIEF: HParam(value=initial_belief,
name=agents_constants.POMCP.INITIAL_BELIEF,
descr="the initial belief"),
agents_constants.POMCP.REINVIGORATION: HParam(value=True, name=agents_constants.POMCP.REINVIGORATION,
descr="whether reinvigoration should be used"),
agents_constants.POMCP.PLANNING_TIME: HParam(value=120, name=agents_constants.POMCP.PLANNING_TIME,
descr="the planning time"),
agents_constants.POMCP.MAX_PARTICLES: HParam(value=100, name=agents_constants.POMCP.MAX_PARTICLES,
@@ -90,8 +94,13 @@
descr="the maximum depth for planning"),
agents_constants.POMCP.C: HParam(value=0.35, name=agents_constants.POMCP.C,
descr="the weighting factor for UCB exploration"),
agents_constants.POMCP.DEFAULT_NODE_VALUE: HParam(
value=-2000, name=agents_constants.POMCP.DEFAULT_NODE_VALUE, descr="the default node value in "
"the search tree"),
agents_constants.POMCP.LOG_STEP_FREQUENCY: HParam(
value=1, name=agents_constants.POMCP.LOG_STEP_FREQUENCY, descr="frequency of logging time-steps"),
agents_constants.POMCP.VERBOSE: HParam(value=False, name=agents_constants.POMCP.VERBOSE,
descr="verbose logging flag"),
agents_constants.COMMON.EVAL_BATCH_SIZE: HParam(value=100, name=agents_constants.COMMON.EVAL_BATCH_SIZE,
descr="number of evaluation episodes"),
agents_constants.COMMON.CONFIDENCE_INTERVAL: HParam(