From ad4e9bd539437ab1862e66b6b7d1c3f978825320 Mon Sep 17 00:00:00 2001 From: Kim Hammar Date: Sat, 20 Jan 2024 21:08:47 +0100 Subject: [PATCH] POMCP [WIP] --- examples/manual_play/cyborg_test.py | 42 --------- .../run_vs_random_attacker_v_001.py | 7 +- .../run_vs_random_attacker_v_001.py | 3 + .../src/csle_agents/agents/pomcp/pomcp.py | 37 +++++--- .../csle_agents/agents/pomcp/pomcp_agent.py | 11 ++- .../csle_agents/agents/pomcp/pomcp_util.py | 37 +++++++- .../src/csle_agents/constants/constants.py | 1 + .../envs/cyborg_scenario_two_defender.py | 87 +++++++++++++------ .../gym_csle_cyborg/util/cyborg_env_util.py | 4 + 9 files changed, 145 insertions(+), 84 deletions(-) delete mode 100644 examples/manual_play/cyborg_test.py diff --git a/examples/manual_play/cyborg_test.py b/examples/manual_play/cyborg_test.py deleted file mode 100644 index fc4707da5..000000000 --- a/examples/manual_play/cyborg_test.py +++ /dev/null @@ -1,42 +0,0 @@ -import random -from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig -from gym_csle_cyborg.dao.red_agent_type import RedAgentType -from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender -from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil - -if __name__ == '__main__': - config = CSLECyborgConfig( - gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT], - maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, decoy_state=True, - scanned_state=True, decoy_optimization=False) - csle_cyborg_env = CyborgScenarioTwoDefender(config=config) - str_info = str(csle_cyborg_env.cyborg_challenge_env.env.env.env.info) - states = {} - state_idx = 0 - host_ids = list(csle_cyborg_env.cyborg_hostname_to_id.values()) - - for i in range(100000): - done = False - csle_cyborg_env.reset() - actions = list(csle_cyborg_env.action_id_to_type_and_host.keys()) - state_id = str(csle_cyborg_env.cyborg_challenge_env.env.env.env.info) - if state_id not in states: - states[state_id] = state_idx - state_idx += 1 - - while not done: - a = random.choice(actions) - o, r, done, _, info = csle_cyborg_env.step(a) - state_vector = CyborgEnvUtil.state_to_vector(state=csle_cyborg_env.get_true_table().rows, - decoy_state=csle_cyborg_env.decoy_state, host_ids=host_ids, - scan_state=csle_cyborg_env.scan_state) - state_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=state_vector) - converted_state_vector = CyborgEnvUtil.state_id_to_state_vector(state_id=state_id) - assert converted_state_vector == state_vector - obs_vector = CyborgEnvUtil.state_to_vector(state=csle_cyborg_env.get_table().rows, - decoy_state=csle_cyborg_env.decoy_state, - host_ids=host_ids, scan_state=csle_cyborg_env.scan_state, - observation=True) - obs_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=obs_vector, observation=True) - converted_obs_vector = CyborgEnvUtil.state_id_to_state_vector(state_id=obs_id, observation=True) - assert converted_obs_vector == obs_vector diff --git a/examples/training/pomcp/cyborg_scenario_two_defender/run_vs_random_attacker_v_001.py b/examples/training/pomcp/cyborg_scenario_two_defender/run_vs_random_attacker_v_001.py index a16b114af..0923e68e1 100644 --- a/examples/training/pomcp/cyborg_scenario_two_defender/run_vs_random_attacker_v_001.py +++ b/examples/training/pomcp/cyborg_scenario_two_defender/run_vs_random_attacker_v_001.py @@ -51,7 +51,7 @@ descr="whether reinvigoration should be used"), agents_constants.POMCP.INITIAL_BELIEF: HParam(value=b1, 
name=agents_constants.POMCP.INITIAL_BELIEF, descr="the initial belief"), - agents_constants.POMCP.PLANNING_TIME: HParam(value=300, name=agents_constants.POMCP.PLANNING_TIME, + agents_constants.POMCP.PLANNING_TIME: HParam(value=2000, name=agents_constants.POMCP.PLANNING_TIME, descr="the planning time"), agents_constants.POMCP.MAX_PARTICLES: HParam(value=1000, name=agents_constants.POMCP.MAX_PARTICLES, descr="the maximum number of belief particles"), @@ -61,6 +61,9 @@ descr="the weighting factor for UCB exploration"), agents_constants.POMCP.LOG_STEP_FREQUENCY: HParam( value=1, name=agents_constants.POMCP.LOG_STEP_FREQUENCY, descr="frequency of logging time-steps"), + agents_constants.POMCP.MAX_NEGATIVE_SAMPLES: HParam( + value=20, name=agents_constants.POMCP.MAX_NEGATIVE_SAMPLES, + descr="maximum number of negative samples when filling belief particles"), agents_constants.POMCP.DEFAULT_NODE_VALUE: HParam( value=-2000, name=agents_constants.POMCP.DEFAULT_NODE_VALUE, descr="the default node value in " "the search tree"), @@ -72,7 +75,7 @@ value=0.95, name=agents_constants.COMMON.CONFIDENCE_INTERVAL, descr="confidence interval"), agents_constants.COMMON.MAX_ENV_STEPS: HParam( - value=500, name=agents_constants.COMMON.MAX_ENV_STEPS, + value=100, name=agents_constants.COMMON.MAX_ENV_STEPS, descr="maximum number of steps in the environment (for envs with infinite horizon generally)"), agents_constants.COMMON.RUNNING_AVERAGE: HParam( value=100, name=agents_constants.COMMON.RUNNING_AVERAGE, diff --git a/examples/training/pomcp/stopping_pomdp_defender/run_vs_random_attacker_v_001.py b/examples/training/pomcp/stopping_pomdp_defender/run_vs_random_attacker_v_001.py index bec0283bb..5c6feab75 100644 --- a/examples/training/pomcp/stopping_pomdp_defender/run_vs_random_attacker_v_001.py +++ b/examples/training/pomcp/stopping_pomdp_defender/run_vs_random_attacker_v_001.py @@ -94,6 +94,9 @@ descr="the maximum depth for planning"), agents_constants.POMCP.C: HParam(value=0.35, name=agents_constants.POMCP.C, descr="the weighting factor for UCB exploration"), + agents_constants.POMCP.MAX_NEGATIVE_SAMPLES: HParam( + value=200, name=agents_constants.POMCP.MAX_NEGATIVE_SAMPLES, + descr="maximum number of negative samples when filling belief particles"), agents_constants.POMCP.DEFAULT_NODE_VALUE: HParam( value=-2000, name=agents_constants.POMCP.DEFAULT_NODE_VALUE, descr="the default node value in " "the search tree"), diff --git a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp.py b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp.py index e2aeba42c..993f81090 100644 --- a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp.py +++ b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp.py @@ -3,12 +3,12 @@ import numpy as np from csle_common.dao.simulation_config.base_env import BaseEnv from csle_common.dao.training.policy import Policy +from csle_common.logging.log import Logger from csle_agents.agents.pomcp.belief_tree import BeliefTree from csle_agents.agents.pomcp.belief_node import BeliefNode from csle_agents.agents.pomcp.action_node import ActionNode from csle_agents.agents.pomcp.pomcp_util import POMCPUtil import csle_agents.constants.constants as constants -from csle_common.logging.log import Logger class POMCP: @@ -197,15 +197,23 @@ def get_action(self) -> int: f"visit count: {a.visit_count}") return int(max(action_vals)[1]) - def update_tree_with_new_samples(self, action: int, observation: int) -> Dict[int, float]: + def 
update_tree_with_new_samples(self, action_sequence: List[int], observation: int,
+                                     max_negative_samples: int = 20) -> Dict[int, float]:
         """
         Updates the tree after an action has been selected and a new observation been received

-        :param action: the action that was executed
+        :param action_sequence: the action sequence that was executed
         :param observation: the observation that was received
+        :param max_negative_samples: the maximum number of negative samples that can be collected before
+               trajectory simulation is initiated
         :return: the updated belief state
         """
+        observation = self.env.get_observation_id_from_vector(
+            observation_vector=self.env.get_observation_from_history(history=[observation]))
         root = self.tree.root
+        if len(action_sequence) == 0:
+            raise ValueError("Invalid action sequence")
+        action = action_sequence[0]

         # Since we executed an action we advance the tree and update the root to the the node corresponding to the
         # action that was selected
@@ -241,19 +249,28 @@ def update_tree_with_new_samples(self, action: int, observation: int) -> Dict[in
             particle_slots = self.max_particles - len(new_root.particles)
         else:
             raise ValueError("Invalid root node")
+        negative_samples_count = 0
         if particle_slots > 0:
             # fill particles by Monte-Carlo using reject sampling
             particles = []
             while len(particles) < particle_slots:
                 if self.verbose:
                     Logger.__call__().get_logger().info(f"Filling particles {len(particles)}/{particle_slots}")
-                s = root.sample_state()
-                self.env.set_state(state=s)
-                _, r, _, _, info = self.env.step(action)
-                s_prime = info[constants.COMMON.STATE]
-                o = info[constants.COMMON.OBSERVATION]
-                if o == observation:
-                    particles.append(s_prime)
+                if negative_samples_count >= max_negative_samples:
+                    particles += POMCPUtil.trajectory_simulation_particles(
+                        o=observation, env=self.env, action_sequence=action_sequence, verbose=self.verbose,
+                        num_particles=(particle_slots - len(particles)))
+                else:
+                    s = root.sample_state()
+                    self.env.set_state(state=s)
+                    _, r, _, _, info = self.env.step(action)
+                    s_prime = info[constants.COMMON.STATE]
+                    o = info[constants.COMMON.OBSERVATION]
+                    if o == observation:
+                        particles.append(s_prime)
+                        negative_samples_count = 0
+                    else:
+                        negative_samples_count += 1
             new_root.particles += particles

         # We now prune the old root from the tree
diff --git a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_agent.py b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_agent.py
index 9369045e3..460f616aa 100644
--- a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_agent.py
+++ b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_agent.py
@@ -166,7 +166,7 @@ def hparam_names(self) -> List[str]:
                 agents_constants.POMCP.A, agents_constants.POMCP.GAMMA, agents_constants.POMCP.INITIAL_BELIEF,
                 agents_constants.POMCP.PLANNING_TIME, agents_constants.POMCP.LOG_STEP_FREQUENCY,
                 agents_constants.POMCP.VERBOSE,
-                agents_constants.POMCP.DEFAULT_NODE_VALUE,
+                agents_constants.POMCP.DEFAULT_NODE_VALUE, agents_constants.POMCP.MAX_NEGATIVE_SAMPLES,
                 agents_constants.POMCP.MAX_PARTICLES, agents_constants.POMCP.C, agents_constants.POMCP.MAX_DEPTH,
                 agents_constants.COMMON.EVAL_BATCH_SIZE, agents_constants.COMMON.CONFIDENCE_INTERVAL,
                 agents_constants.COMMON.RUNNING_AVERAGE, agents_constants.COMMON.MAX_ENV_STEPS]
@@ -188,6 +188,7 @@ def pomcp(self, exp_result: ExperimentResult, seed: int,
         log_steps_frequency = self.experiment_config.hparams[agents_constants.POMCP.LOG_STEP_FREQUENCY].value
         verbose = self.experiment_config.hparams[agents_constants.POMCP.VERBOSE].value
         default_node_value = self.experiment_config.hparams[agents_constants.POMCP.DEFAULT_NODE_VALUE].value
+        max_negative_samples = self.experiment_config.hparams[agents_constants.POMCP.MAX_NEGATIVE_SAMPLES].value
         max_env_steps = self.experiment_config.hparams[agents_constants.COMMON.MAX_ENV_STEPS].value
         N = self.experiment_config.hparams[agents_constants.POMCP.N].value
         A = self.experiment_config.hparams[agents_constants.POMCP.A].value
@@ -203,9 +204,8 @@ def pomcp(self, exp_result: ExperimentResult, seed: int,

         # Run N episodes
         for i in range(N):
-
-            # Setup environments
             done = False
+            action_sequence = []
             eval_env = gym.make(self.simulation_env_config.gym_env_name, config=config)
             train_env: BaseEnv = gym.make(self.simulation_env_config.gym_env_name, config=config)
             _, info = eval_env.reset()
@@ -225,9 +225,11 @@
                 pomcp.solve(max_depth=max_depth)
                 action = pomcp.get_action()
                 _, r, done, _, info = eval_env.step(action)
+                action_sequence.append(action)
                 s_prime = info[agents_constants.COMMON.STATE]
                 o = info[agents_constants.COMMON.OBSERVATION]
-                belief = pomcp.update_tree_with_new_samples(action=action, observation=o)
+                belief = pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=o,
+                                                            max_negative_samples=max_negative_samples)
                 R += r
                 t += 1
                 if t % log_steps_frequency == 0:
@@ -235,6 +237,7 @@
                    Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action}, r: {r}, o: {o}, "
                                                        f"s_prime: {s_prime}, b: {b}")
                    Logger.__call__().get_logger().info(f"action: {eval_env.action_id_to_type_and_host[action]}")
+                s = s_prime

             if i % self.experiment_config.log_every == 0:
                 # Logging
diff --git a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_util.py b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_util.py
index 7b767abc5..1df62cb6f 100644
--- a/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_util.py
+++ b/simulation-system/libs/csle-agents/src/csle_agents/agents/pomcp/pomcp_util.py
@@ -1,7 +1,10 @@
 from typing import List, Dict, Any
 import numpy as np
-from csle_agents.agents.pomcp.node import Node
 from collections import Counter
+from csle_common.logging.log import Logger
+from csle_common.dao.simulation_config.base_env import BaseEnv
+from csle_agents.agents.pomcp.node import Node
+import csle_agents.constants.constants as constants


 class POMCPUtil:
@@ -84,3 +87,35 @@ def ucb_acquisition_function(action: "Node", c: float) -> float:
         :return: the acquisition value of the action
         """
         return float(action.value + c * POMCPUtil.ucb(action.parent.visit_count, action.visit_count))
+
+    @staticmethod
+    def trajectory_simulation_particles(o: int, env: BaseEnv, action_sequence: List[int], num_particles: int,
+                                        verbose: bool = False) -> List[int]:
+        """
+        Performs trajectory simulations to find possible states matching the given observation
+
+        :param o: the observation to match against
+        :param env: the black-box simulator to use for generating trajectories
+        :param action_sequence: the action sequence for the trajectory
+        :param num_particles: the number of particles to collect
+        :param verbose: boolean flag indicating whether logging should be verbose or not
+        :return: the list of particles matching the given observation
+        """
+        particles: List[int] = []
+        while len(particles) < num_particles:
+            done = False
+            _, info = env.reset()
+            s =
info[constants.COMMON.STATE] + t = 0 + while not done and t < len(action_sequence): + _, r, done, _, info = env.step(action=action_sequence[t]) + sampled_o = info[constants.COMMON.OBSERVATION] + if t == len(action_sequence) - 1 and sampled_o == o: + particles.append(s) + s = info[constants.COMMON.STATE] + t += 1 + if verbose: + Logger.__call__().get_logger().info(f"Filling particles {len(particles)}/{num_particles} " + f"through trajectory simulations, " + f"action sequence: {action_sequence}, observation: {o}") + return particles diff --git a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py index 5f5c8adf8..e5d00724b 100644 --- a/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py +++ b/simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py @@ -559,6 +559,7 @@ class POMCP: REINVIGORATION = "reinvigoration" PLANNING_TIME = "planning_time" MAX_PARTICLES = "max_particles" + MAX_NEGATIVE_SAMPLES = "max_negative_samples" C = "c" MAX_DEPTH = "max_depth" LOG_STEP_FREQUENCY = "log_step_frequency" diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_defender.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_defender.py index aecc1609b..5f9d04feb 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_defender.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/envs/cyborg_scenario_two_defender.py @@ -31,18 +31,7 @@ def __init__(self, config: CSLECyborgConfig): self.config = config # Setup Cyborg Env - (cyborg_scenario_config_path, cyborg_challenge_env, cyborg_hostnames, cyborg_hostname_to_id, - cyborg_subnets, cyborg_subnet_to_id, cyborg_action_id_to_type_and_host, cyborg_action_type_and_host_to_id, - red_agent_type) = CyborgEnvUtil.setup_cyborg_env(config=self.config) - self.cyborg_scenario_config_path = cyborg_scenario_config_path - self.cyborg_challenge_env = cyborg_challenge_env - self.cyborg_hostnames = cyborg_hostnames - self.cyborg_hostname_to_id = cyborg_hostname_to_id - self.cyborg_subnets = cyborg_subnets - self.cyborg_subnet_to_id = cyborg_subnet_to_id - self.cyborg_action_id_to_type_and_host = cyborg_action_id_to_type_and_host - self.cyborg_action_type_and_host_to_id = cyborg_action_type_and_host_to_id - self.red_agent_type = red_agent_type + self.create_cyborg_env() # Setup defender decoy actions self.decoy_action_types = CyborgEnvUtil.get_decoy_action_types(scenario=self.config.scenario) @@ -95,6 +84,25 @@ def __init__(self, config: CSLECyborgConfig): self.reset() super().__init__() + def create_cyborg_env(self) -> None: + """ + Creates the cyborg environment + + :return: None + """ + (cyborg_scenario_config_path, cyborg_challenge_env, cyborg_hostnames, cyborg_hostname_to_id, + cyborg_subnets, cyborg_subnet_to_id, cyborg_action_id_to_type_and_host, cyborg_action_type_and_host_to_id, + red_agent_type) = CyborgEnvUtil.setup_cyborg_env(config=self.config) + self.cyborg_scenario_config_path = cyborg_scenario_config_path + self.cyborg_challenge_env = cyborg_challenge_env + self.cyborg_hostnames = cyborg_hostnames + self.cyborg_hostname_to_id = cyborg_hostname_to_id + self.cyborg_subnets = cyborg_subnets + self.cyborg_subnet_to_id = cyborg_subnet_to_id + self.cyborg_action_id_to_type_and_host = cyborg_action_id_to_type_and_host + self.cyborg_action_type_and_host_to_id = cyborg_action_type_and_host_to_id + 
self.red_agent_type = red_agent_type + def step(self, action: int) -> Tuple[npt.NDArray[Any], float, bool, bool, Dict[str, Any]]: """ Takes a step in the environment by executing the given action @@ -217,18 +225,10 @@ def populate_info(self, info: Dict[str, Any], obs: npt.NDArray[Any], reset: bool info[env_constants.CYBORG.OBS_PER_HOST].append(host_obs) host_vector_obs.append(self.scan_state[i]) info[env_constants.CYBORG.VECTOR_OBS_PER_HOST].append(host_vector_obs) - host_ids = list(self.cyborg_hostname_to_id.values()) - state_vector = CyborgEnvUtil.state_to_vector(state=self.get_true_table().rows, - decoy_state=self.decoy_state, - host_ids=host_ids, - scan_state=self.scan_state) - state_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=state_vector) + state_id = self.get_state_id() + obs_id = self.get_observation_id() if reset: self.initial_belief = {state_id: 1} - obs_vector = CyborgEnvUtil.state_to_vector(state=self.get_table().rows, - decoy_state=self.decoy_state, - host_ids=host_ids, scan_state=self.scan_state, observation=True) - obs_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=obs_vector, observation=True) info[env_constants.ENV_METRICS.STATE] = state_id info[env_constants.ENV_METRICS.OBSERVATION] = obs_id if state_id not in self.visited_cyborg_states: @@ -242,7 +242,7 @@ def populate_info(self, info: Dict[str, Any], obs: npt.NDArray[Any], reset: bool deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.done), deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.reward), deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.actions), - deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.step), + deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.steps), deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.hostname_ip_map), deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.subnet_cidr_map), deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.observation), @@ -250,7 +250,9 @@ def populate_info(self, info: Dict[str, Any], obs: npt.NDArray[Any], reset: bool deepcopy(self.cyborg_challenge_env.env.env.env.success), deepcopy(self.cyborg_challenge_env.env.env.env.baseline), deepcopy(self.cyborg_challenge_env.env.env.env.info), - deepcopy(self.cyborg_challenge_env.env.env.env.blue_info) + deepcopy(self.cyborg_challenge_env.env.env.env.blue_info), + deepcopy(self.cyborg_challenge_env.step_counter), + deepcopy(self.cyborg_challenge_env.env.env.env.env.env.environment_controller.INFO_DICT), ) self.visited_scanned_states[state_id] = deepcopy(self.scan_state) self.visited_decoy_states[state_id] = deepcopy(self.decoy_state) @@ -427,7 +429,7 @@ def set_state(self, state: Any) -> None: deepcopy(self.visited_cyborg_states[s][4]) self.cyborg_challenge_env.env.env.env.env.env.environment_controller.actions = \ deepcopy(self.visited_cyborg_states[s][5]) - self.cyborg_challenge_env.env.env.env.env.env.environment_controller.step = \ + self.cyborg_challenge_env.env.env.env.env.env.environment_controller.steps = \ deepcopy(self.visited_cyborg_states[s][6]) self.cyborg_challenge_env.env.env.env.env.env.environment_controller.hostname_ip_map = \ deepcopy(self.visited_cyborg_states[s][7]) @@ -440,6 +442,9 @@ def set_state(self, state: Any) -> None: self.cyborg_challenge_env.env.env.env.baseline = deepcopy(self.visited_cyborg_states[s][12]) self.cyborg_challenge_env.env.env.env.info = 
deepcopy(self.visited_cyborg_states[s][13]) self.cyborg_challenge_env.env.env.env.blue_info = deepcopy(self.visited_cyborg_states[s][14]) + self.cyborg_challenge_env.step_counter = deepcopy(self.visited_cyborg_states[s][15]) + self.cyborg_challenge_env.env.env.env.env.env.environment_controller.INFO_DICT = \ + deepcopy(self.visited_cyborg_states[s][16]) self.decoy_state = deepcopy(self.visited_decoy_states[s]) self.scan_state = deepcopy(self.visited_scanned_states[s]) self.cyborg_challenge_env.env.env.env.env.observation_change(obs) @@ -494,3 +499,35 @@ def get_state_from_id(self, state_id: int) -> List[List[int]]: :return: the observation vector """ return CyborgEnvUtil.state_id_to_state_vector(state_id=state_id, observation=False) + + def get_observation_id_from_vector(self, observation_vector: List[Any]) -> int: + """ + Converts an observation vector to an id + + :param observation_vector: the vector to convert + :return: the observation id + """ + return CyborgEnvUtil.state_vector_to_state_id(state_vector=observation_vector, observation=True) + + def get_observation_id(self) -> int: + """ + :return: the current observation id + """ + host_ids = list(self.cyborg_hostname_to_id.values()) + obs_vector = CyborgEnvUtil.state_to_vector(state=self.get_table().rows, + decoy_state=self.decoy_state, + host_ids=host_ids, scan_state=self.scan_state, observation=True) + obs_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=obs_vector, observation=True) + return obs_id + + def get_state_id(self) -> int: + """ + :return: the current state id + """ + host_ids = list(self.cyborg_hostname_to_id.values()) + state_vector = CyborgEnvUtil.state_to_vector(state=self.get_true_table().rows, + decoy_state=self.decoy_state, + host_ids=host_ids, + scan_state=self.scan_state) + state_id = CyborgEnvUtil.state_vector_to_state_id(state_vector=state_vector) + return state_id diff --git a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/util/cyborg_env_util.py b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/util/cyborg_env_util.py index 7899d8a5f..ee75aa20f 100644 --- a/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/util/cyborg_env_util.py +++ b/simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/util/cyborg_env_util.py @@ -336,6 +336,8 @@ def state_to_vector(state: List[List[Any]], decoy_state: List[List[BlueAgentActi host_access = 1 if host_access == "Privileged": host_access = 2 + if host_access == "Unknown": + host_access = 3 host_decoy_state = len(decoy_state[host_id]) if not observation: state_vector.append([host_known, host_scanned, host_access, host_decoy_state]) @@ -353,6 +355,7 @@ def state_vector_to_state_id(state_vector: List[List[int]], observation: bool = :return: the id """ binary_id_str = "" + host_bins = [] for host_vec in state_vector: host_binary_id_str = "" for i, elem in enumerate(host_vec): @@ -371,6 +374,7 @@ def state_vector_to_state_id(state_vector: List[List[int]], observation: bool = if i == 3: host_binary_id_str += format(elem, '03b') binary_id_str += host_binary_id_str + host_bins.append(host_binary_id_str) state_id = int(binary_id_str, 2) return state_id
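Note on the particle refilling introduced in pomcp.py above: when rejection sampling from the previous belief produces too many consecutive mismatching observations, the belief update falls back to trajectory simulation over the executed action sequence. The following is a minimal standalone sketch of that control flow; the sample_state, step and reset_and_replay callables are hypothetical stand-ins for the csle environment API, not actual csle functions.

from typing import Callable, List, Tuple


def fill_particles(sample_state: Callable[[], int],
                   step: Callable[[int, int], Tuple[int, int]],
                   reset_and_replay: Callable[[List[int]], Tuple[int, int]],
                   last_action: int, action_sequence: List[int], observation: int,
                   num_particles: int, max_negative_samples: int = 20) -> List[int]:
    """Collects num_particles states consistent with the latest real observation."""
    particles: List[int] = []
    negative_samples = 0
    while len(particles) < num_particles:
        if negative_samples >= max_negative_samples:
            # Too many consecutive rejections: fall back to replaying the executed
            # action sequence from fresh resets and keep end-states whose final
            # observation matches the real one (trajectory simulation)
            while len(particles) < num_particles:
                s_prime, o = reset_and_replay(action_sequence)
                if o == observation:
                    particles.append(s_prime)
            break
        # Rejection sampling: propagate a belief particle one step with the executed
        # action and keep it only if the simulated observation matches the real one
        s = sample_state()
        s_prime, o = step(s, last_action)
        if o == observation:
            particles.append(s_prime)
            negative_samples = 0
        else:
            negative_samples += 1
    return particles

Resetting the negative-sample counter after every accepted particle, as the patch does, keeps the cheap rejection sampler in use as long as it still finds matches and reserves the more expensive full-trajectory replays for observations that the current belief cannot explain.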
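The set_state support in cyborg_scenario_two_defender.py relies on caching deep copies of the wrapped CybORG controller's mutable fields for every visited state id (the patch extends the snapshot with step_counter and INFO_DICT) and copying them back on demand. A generic sketch of this snapshot/restore pattern follows; the Simulator class and its fields are hypothetical stand-ins for the wrapped environment controller.

import copy
from typing import Any, Dict, Tuple


class Simulator:
    """Hypothetical stand-in for the wrapped CybORG environment controller."""

    def __init__(self) -> None:
        self.steps = 0
        self.hosts: Dict[str, Any] = {}


class SnapshottingEnv:
    """Emulates set_state() for a simulator without native state-loading support."""

    def __init__(self) -> None:
        self.sim = Simulator()
        self.snapshots: Dict[int, Tuple[int, Dict[str, Any]]] = {}

    def save_state(self, state_id: int) -> None:
        # Deep copies are essential: the simulator mutates these objects in place,
        # so a shallow reference would be silently overwritten by later steps
        self.snapshots[state_id] = (copy.deepcopy(self.sim.steps), copy.deepcopy(self.sim.hosts))

    def set_state(self, state_id: int) -> None:
        # Restore the cached copies so planning can rewind to a visited state
        steps, hosts = self.snapshots[state_id]
        self.sim.steps = copy.deepcopy(steps)
        self.sim.hosts = copy.deepcopy(hosts)

Anything the controller mutates between steps has to be part of the snapshot, which is why the patch adds the step counter and INFO_DICT to the cached tuple.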
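The state and observation ids produced by CyborgEnvUtil.state_vector_to_state_id are built by packing each host's feature vector into fixed-width binary fields and interpreting the concatenation as an integer; state_id_to_state_vector reverses this. Below is a generic sketch of that round trip; the bit widths (known=1, scanned=1, access=2, decoys=3) are illustrative assumptions and not necessarily the exact widths used for states versus observations.

from typing import List

BIT_WIDTHS = [1, 1, 2, 3]  # assumed widths: known, scanned, access level, number of decoys


def vector_to_id(state_vector: List[List[int]]) -> int:
    # Concatenate every host's fields as fixed-width binary strings
    bits = ""
    for host_vec in state_vector:
        for width, elem in zip(BIT_WIDTHS, host_vec):
            bits += format(elem, f"0{width}b")  # assumes elem fits in the given width
    return int(bits, 2)


def id_to_vector(state_id: int, num_hosts: int) -> List[List[int]]:
    # Zero-pad to the full width so leading zero-valued fields are recovered correctly
    total_bits = sum(BIT_WIDTHS) * num_hosts
    bits = format(state_id, f"0{total_bits}b")
    vector, pos = [], 0
    for _ in range(num_hosts):
        host_vec = []
        for width in BIT_WIDTHS:
            host_vec.append(int(bits[pos:pos + width], 2))
            pos += width
        vector.append(host_vec)
    return vector


# Round-trip check on two hypothetical hosts
example = [[1, 0, 2, 5], [0, 1, 3, 0]]
assert id_to_vector(vector_to_id(example), num_hosts=2) == example

If the access field is two bits wide, the new "Unknown" access value mapped to 3 in state_to_vector still fits without widening the encoding.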