From fbcaa1021196fcaaa15ac1b473dfef961cd21023 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 09:24:35 +0000 Subject: [PATCH 01/29] added part of vm scheduling RL code --- examples/rl/cim/ac.py | 2 +- examples/rl/cim/env_wrapper.py | 5 +- examples/rl/vm_scheduling/README.md | 10 ++ examples/rl/vm_scheduling/__init__.py | 8 + examples/rl/vm_scheduling/ac.py | 97 +++++++++++ examples/rl/vm_scheduling/agent_wrapper.py | 35 ++++ examples/rl/vm_scheduling/env_wrapper.py | 187 +++++++++++++++++++++ examples/rl/vm_scheduling/policy_index.py | 16 ++ maro/rl/learning/env_wrapper.py | 2 - 9 files changed, 357 insertions(+), 5 deletions(-) create mode 100644 examples/rl/vm_scheduling/README.md create mode 100644 examples/rl/vm_scheduling/__init__.py create mode 100644 examples/rl/vm_scheduling/ac.py create mode 100644 examples/rl/vm_scheduling/agent_wrapper.py create mode 100644 examples/rl/vm_scheduling/env_wrapper.py create mode 100644 examples/rl/vm_scheduling/policy_index.py diff --git a/examples/rl/cim/ac.py b/examples/rl/cim/ac.py index 1b27ca6e4..3b2aa8b03 100644 --- a/examples/rl/cim/ac.py +++ b/examples/rl/cim/ac.py @@ -11,7 +11,7 @@ from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig -cim_path = os.path.dirname(os.path.dirname(__file__)) +cim_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, cim_path) from env_wrapper import STATE_DIM, env_config diff --git a/examples/rl/cim/env_wrapper.py b/examples/rl/cim/env_wrapper.py index 28ab6bb1a..68e3cebb5 100644 --- a/examples/rl/cim/env_wrapper.py +++ b/examples/rl/cim/env_wrapper.py @@ -30,6 +30,7 @@ def __init__( (self.look_back + 1) * (self.max_ports_downstream + 1) * len(self.port_attributes) + len(self.vessel_attributes) ) + self._state_info = None @property def state_dim(self): @@ -44,7 +45,7 @@ def get_state(self, tick=None): future_port_idx_list = vessel_snapshots[tick: vessel_idx: 'future_stop_list'].astype('int') port_features = port_snapshots[ticks: [port_idx] + list(future_port_idx_list): self.port_attributes] vessel_features = vessel_snapshots[tick: vessel_idx: self.vessel_attributes] - self.state_info = { + self._state_info = { port_idx: { "tick": tick, "action_scope": self.event.action_scope, @@ -59,7 +60,7 @@ def get_state(self, tick=None): def to_env_action(self, action_by_agent: dict): env_action = {} for agent_id, action_info in action_by_agent.items(): - state_info = self.state_info[agent_id] + state_info = self._state_info[agent_id] tick, port, vessel, action_scope = ( state_info["tick"], state_info["port_idx"], state_info["vessel_idx"], state_info["action_scope"] ) diff --git a/examples/rl/vm_scheduling/README.md b/examples/rl/vm_scheduling/README.md new file mode 100644 index 000000000..2d280f476 --- /dev/null +++ b/examples/rl/vm_scheduling/README.md @@ -0,0 +1,10 @@ +# Container Inventory Management + +Container inventory management (CIM) is a scenario where reinforcement learning (RL) can potentially prove useful. In this folder you can find: +* ``env_wrapper.py``, which contains a function to generate an environment wrapper to interact +with our "agent" (see below); +* ``agent_wrapper.py``, which contains a function to generate an agent wrapper to interact +with the environment wrapper; +* ``policy_index``, which maps policy names to functions that create them; the functions to create DQN and Actor-Critic policies are defined in ``dqn.py`` and ``ac.py``, respectively. 
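+
+A minimal usage sketch of the ``policy_index`` mapping (illustrative only — the real entry points live under ``examples/rl/workflows`` and the creation functions may take extra arguments):
+
+```python
+# Build one policy instance per agent from the name -> creation-function mapping
+# exported by policy_index.py; the creation functions themselves come from dqn.py / ac.py.
+from policy_index import policy_func_index
+
+policies = {name: create() for name, create in policy_func_index.items()}
+```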
+ +The code for the actual learning workflows (e.g., learner, roll-out worker and trainer) can be found under ``examples/rl/workflows``. The reason for putting it in a separate folder is that these workflows apply to any scenario, so long as the necessary component generators, such as the ones listed above, are provided. See ``README`` under ``examples/rl`` for details. We recommend that you follow this example to write your own scenarios. \ No newline at end of file diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py new file mode 100644 index 000000000..cddfbf182 --- /dev/null +++ b/examples/rl/vm_scheduling/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .agent_wrapper import get_agent_wrapper +from .env_wrapper import get_env_wrapper +from .policy_index import policy_func_index, update_trigger, warmup + +__all__ = ["get_agent_wrapper", "get_env_wrapper", "policy_func_index", "update_trigger", "warmup"] diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py new file mode 100644 index 000000000..0e9e8704b --- /dev/null +++ b/examples/rl/vm_scheduling/ac.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import sys + +import numpy as np +import torch + +from maro.rl.experience import ExperienceManager +from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption +from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig + +vm_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, vm_path) +from env_wrapper import STATE_DIM + +config = { + "model": { + "network": { + "actor": { + "input_dim": STATE_DIM, + "output_dim": 9, + "hidden_dims": [64, 32, 32], + "activation": "leaky_relu", + "softmax": True, + "batch_norm": False, + "head": True + }, + "critic": { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True + } + }, + "optimization": { + "actor": { + "optim_cls": "adam", + "optim_params": {"lr": 0.0001} + }, + "critic": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.001} + } + } + } +} + + +class MyACNet(DiscreteACNet): + def forward(self, states, actor: bool = True, critic: bool = True): + inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) + + if len(states.shape) == 1: + states = states.unsqueeze(dim=0) + return ( + self.component["actor"](inputs) if actor else None, + self.component["critic"](inputs) if critic else None + ) + + def get_action(self, states, training=True): + """ + Given Q-values for a batch of states, return the action index and the corresponding maximum Q-value + for each state. 
+ """ + states, legal_action = states + legal_action = torch.from_numpy(np.asarray(legal_action)).to(self.device) + + if not training: + action_prob = self.forward(states, critic=False)[0] + _, action = (action_prob + (legal_action - 1) * 1e8).max(dim=1) + return action, action_prob + + action_prob = Categorical(self.forward(states, critic=False)[0] * legal_action) # (batch_size, action_space_size) + action = action_prob.sample() + log_p = action_prob.log_prob(action) + + return action, log_p + + +def get_ac_policy(): + ac_net = MyACNet( + component={ + "actor": config["actor_type"](**config["model"]["network"]["actor"]), + "critic": agent_config["critic_type"](**config["model"]["network"]["critic"]) + }, + optim_option={ + "actor": OptimOption(**config["model"]["optimization"]["actor"]), + "critic": OptimOption(**config["model"]["optimization"]["critic"]) + } + ) + experience_manager = ExperienceManager(**config["experience_manager"]) + return ActorCritic(ac_net, experience_manager, ActorCriticConfig(**config["algorithm_config"])) diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py new file mode 100644 index 000000000..27e23788a --- /dev/null +++ b/examples/rl/vm_scheduling/agent_wrapper.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import sys + +from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler +from maro.rl.learning import AgentWrapper + +cim_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, cim_path) +from env_wrapper import AGENT_IDS, env_config +from policy_index import policy_func_index + + +exploration_config = { + "last_ep": 10, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(5, 0.32)] +} + +def get_agent_wrapper(): + epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) + epsilon_greedy.register_schedule( + scheduler_cls=MultiPhaseLinearExplorationScheduler, + param_name="epsilon", + **exploration_config + ) + return AgentWrapper( + {name: func(learning=False) for name, func in policy_func_index.items()}, + {name: name for name in AGENT_IDS}, + exploration_dict={f"EpsilonGreedy": epsilon_greedy}, + agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} + ) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py new file mode 100644 index 000000000..a1d251f3e --- /dev/null +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -0,0 +1,187 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
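+
+# Environment wrapper for the VM scheduling scenario. It assembles the per-decision
+# state from physical machine (PM) capacity/usage snapshots and the incoming VM
+# request (plus a mask of legal PMs), converts the chosen action index into an
+# AllocateAction or PostponeAction, and computes a reward that weights allocation
+# success against VM income via the alpha / beta coefficients.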
+ +import numpy as np + +from maro.rl.learning import AbsEnvWrapper +from maro.simulator import Env +from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction + + +class VMEnvWrapper(AbsEnvWrapper): + def __init__( + self, + env: Env, + pm_attributes: list, + vm_attributes: list, + alpha: float, + beta: float, + pm_num: int, + durations: int, + vm_state_path: str, + vm_window_size: int = 1, + pm_window_size: int = 1, + gamma: float = 0.0, + reward_eval_delay: int = 0, + save_replay: bool = True + ): + super().__init__(env, reward_eval_delay=reward_eval_delay, save_replay=save_replay, replay_agent_ids=["AGENT"]) + self._pm_attributes = pm_attributes + self._vm_attributes = vm_attributes + self._st = 0 + self._static_vm_states = np.load(vm_state_path) + self._vm_window_size = vm_window_size + self._pm_window_size = pm_window_size + + self._alpha, self._beta = alpha, beta # adjust the ratio of the success allocation and the total income when computing the reward + self._gamma = gamma # reward discount + self._pm_num = pm_num # the number of pms + self._durations = durations # the duration of the whole environment + + self._pm_state_history = np.zeros((pm_window_size - 1, self._pm_num, 2)) + + self._state_dim = 2 * pm_num * pm_window_size + 5 * vm_window_size + + @property + def state_dim(self): + return self._state_dim + + def get_state(self): + pm_state, vm_state = self._get_pm_state(), self._get_vm_state() + # get the legal number of PM. + legal_pm_mask = np.zeros(self._pm_num + 1) + if len(self._event.valid_pms) <= 0: + # no pm available + legal_pm_mask[self._pm_num] = 1 + else: + legal_pm_mask[self._pm_num] = 1 + + remain_cpu_dict = dict() + for pm in self._event.valid_pms: + # if two pm has same remaining cpu, only choose the one which has smaller id + if pm_state[-1, pm, 0] not in remain_cpu_dict: + remain_cpu_dict[pm_state[-1, pm, 0]] = 1 + legal_pm_mask[pm] = 1 + else: + legal_pm_mask[pm] = 0 + return { + "AGENT": { + "model": np.concatenate((pm_state.flatten(), vm_state.flatten())), + "legal_pm_mask": legal_pm_mask + } + } + + def to_env_action(self, action_info): + model_action = action_info[0] if isinstance(action_info, tuple) else action_info + if model_action == self._pm_num: + action = PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + else: + action = AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) + return {"AGENT": action} + + def get_reward(self, action_info): + model_action = action_info[0] if isinstance(action_info, tuple) else action_info + if model_action == self._pm_num: + if np.sum(self._state_info["AGENT"]["legal_pm"]) != 1: + reward = -0.1 * self._alpha + 0.0 * self._beta + else: + reward = 0.0 * self._alpha + 0.0 * self._beta + else: + reward = ( + 1.0 * self._alpha + + ( + self._event.vm_unit_price + * min(self._durations - self._event.frame_index, self._event.vm_lifetime) + ) * self._beta + ) + return {"AGENT": reward} + + def _get_pm_state(self): + total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] + total_pm_info = total_pm_info.reshape(self._pm_num, len(self._pm_attributes)) + + # normalize the attributes of pms' cpu and memory + self._max_cpu_capacity = np.max(total_pm_info[:, 0]) + self._max_memory_capacity = np.max(total_pm_info[:, 1]) + total_pm_info[:, 2] /= self._max_cpu_capacity + total_pm_info[:, 3] /= self._max_memory_capacity + + # get the remaining cpu and memory of the pms + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._pm_num, 1) + remain_memory = (1 - 
total_pm_info[:, 3]).reshape(1, self._pm_num, 1) + + # get the pms' information + total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, pm_num, 2) + + # get the sequence pms' information + self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) + return self._pm_state_history[-self._pm_window_size:, :, :].copy() # (win_size, pm_num, 2) + + def _update_vm_state(self): + if self._vm_window_size == 1: + # get the vm's infomation + vm_info = np.array([ + self._event.vm_cpu_cores_requirement, + self._event.vm_memory_requirement, + min(self._durations - self.env.tick, self._event.vm_lifetime) / 200, + (self._durations - self.env.tick) * 1.0 / 200, + self._event.vm_unit_price * min(self._durations - self.env.tick, self._event.vm_lifetime) + ], dtype=np.float) + vm_info[0] /= self._max_cpu_capacity + vm_info[1] /= self._max_memory_capacity + return vm_info + else: + # get the sequence vms' information + total_vm_info = np.zeros((self._vm_window_size, len(self._vm_attributes)))) + + for idx in range(self._st, self._st + self._vm_window_size): + if idx < self._static_vm_states.shape[0]: + vm_info = self._static_vm_states[idx].copy() + vm_info[0] /= self._max_cpu_capacity + vm_info[1] /= self._max_memory_capacity + vm_info[4] = vm_info[4] * min(self._durations - vm_info[3], vm_info[2]) + vm_info[2] = (vm_info[2] * 1.0) / 200 + vm_info[3] = (self._durations - vm_info[3]) * 1.0 / 200 + else: + vm_info = np.zeros(len(self._vm_attributes), dtype=np.float) + + total_vm_info[self._vm_window_size - (idx - self._st + 1), :] = vm_info + + self._st = (self._st + 1) % self._static_vm_states.shape[0] + return total_vm_info + + +env_config = { + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k", + "start_tick": 0, + "durations": 8638, + "snapshot_resolution": 1 + }, + "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "alpha": 0.0, + "beta": 1.0, + "pm_num": 8, + "durations": 200, + "vm_state_path": "../data/train_vm_states.npy", + "vm_window_size": 1, + "pm_window_size": 1, + "gamma": 0.9 + }, + "seed": 666 +} + + +def get_env_wrapper(): + env = Env(**env_config["basic"]) + env.set_seed(env_config["seed"]) + return VMEnvWrapper(env, **env_config["wrapper"]) + + +tmp_env_wrapper = get_env_wrapper() +AGENT_IDS = tmp_env_wrapper.agent_idx_list +STATE_DIM = tmp_env_wrapper.state_dim +del tmp_env_wrapper diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py new file mode 100644 index 000000000..b824905dd --- /dev/null +++ b/examples/rl/vm_scheduling/policy_index.py @@ -0,0 +1,16 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
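+
+# Policy registry for this scenario: maps each agent ID (used as the policy name)
+# to the function that creates its policy, and declares per-policy update triggers
+# and warm-up sizes for the learning workflows.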
+ +import os +import sys + +cim_path = os.path.dirname(os.path.realpath(__file__)) +if cim_path not in sys.path: + sys.path.insert(0, cim_path) +from dqn import get_dqn_policy +from env_wrapper import AGENT_IDS + +# use agent IDs as policy names since each agent uses a separate policy +update_trigger = {name: 128 for name in AGENT_IDS} +warmup = {name: 1 for name in AGENT_IDS} +policy_func_index = {name: get_dqn_policy for name in AGENT_IDS} diff --git a/maro/rl/learning/env_wrapper.py b/maro/rl/learning/env_wrapper.py index 22edd4b69..74cfcd2f6 100644 --- a/maro/rl/learning/env_wrapper.py +++ b/maro/rl/learning/env_wrapper.py @@ -23,7 +23,6 @@ class AbsEnvWrapper(ABC): """ def __init__(self, env: Env, reward_eval_delay: int = 0, save_replay: bool = True, replay_agent_ids: list = None): self.env = env - self.state_info = None # context for converting model output to actions that can be executed by the env self.reward_eval_delay = reward_eval_delay self.action_history = defaultdict(dict) self.save_replay = save_replay @@ -176,7 +175,6 @@ def get_experiences(self): def reset(self): self.env.reset() - self.state_info = None self._total_reward.clear() self._state = None self._pending_reward_cache.clear() From b7530c193de74f513e1b3863c9d459a460f96794 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 11:11:48 +0000 Subject: [PATCH 02/29] refined vm env_wrapper code style --- examples/rl/vm_scheduling/agent_wrapper.py | 35 ---------------------- examples/rl/vm_scheduling/env_wrapper.py | 22 +++++--------- examples/rl/vm_scheduling/policy_index.py | 34 +++++++++++++++++++-- 3 files changed, 39 insertions(+), 52 deletions(-) delete mode 100644 examples/rl/vm_scheduling/agent_wrapper.py diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py deleted file mode 100644 index 27e23788a..000000000 --- a/examples/rl/vm_scheduling/agent_wrapper.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler -from maro.rl.learning import AgentWrapper - -cim_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, cim_path) -from env_wrapper import AGENT_IDS, env_config -from policy_index import policy_func_index - - -exploration_config = { - "last_ep": 10, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(5, 0.32)] -} - -def get_agent_wrapper(): - epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) - epsilon_greedy.register_schedule( - scheduler_cls=MultiPhaseLinearExplorationScheduler, - param_name="epsilon", - **exploration_config - ) - return AgentWrapper( - {name: func(learning=False) for name, func in policy_func_index.items()}, - {name: name for name in AGENT_IDS}, - exploration_dict={f"EpsilonGreedy": epsilon_greedy}, - agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} - ) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index a1d251f3e..f2c19c2d7 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -32,14 +32,13 @@ def __init__( self._static_vm_states = np.load(vm_state_path) self._vm_window_size = vm_window_size self._pm_window_size = pm_window_size - - self._alpha, self._beta = alpha, beta # adjust the ratio of the success allocation and the total income when computing the reward + # adjust the ratio of the success allocation and the total income when computing the reward + self._alpha = alpha + self._beta = beta self._gamma = gamma # reward discount self._pm_num = pm_num # the number of pms self._durations = durations # the duration of the whole environment - self._pm_state_history = np.zeros((pm_window_size - 1, self._pm_num, 2)) - self._state_dim = 2 * pm_num * pm_window_size + 5 * vm_window_size @property @@ -55,7 +54,6 @@ def get_state(self): legal_pm_mask[self._pm_num] = 1 else: legal_pm_mask[self._pm_num] = 1 - remain_cpu_dict = dict() for pm in self._event.valid_pms: # if two pm has same remaining cpu, only choose the one which has smaller id @@ -88,11 +86,8 @@ def get_reward(self, action_info): reward = 0.0 * self._alpha + 0.0 * self._beta else: reward = ( - 1.0 * self._alpha - + ( - self._event.vm_unit_price - * min(self._durations - self._event.frame_index, self._event.vm_lifetime) - ) * self._beta + 1.0 * self._alpha + self._beta * self._event.vm_unit_price * + min(self._durations - self._event.frame_index, self._event.vm_lifetime) ) return {"AGENT": reward} @@ -117,7 +112,7 @@ def _get_pm_state(self): self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) return self._pm_state_history[-self._pm_window_size:, :, :].copy() # (win_size, pm_num, 2) - def _update_vm_state(self): + def _get_vm_state(self): if self._vm_window_size == 1: # get the vm's infomation vm_info = np.array([ @@ -132,8 +127,7 @@ def _update_vm_state(self): return vm_info else: # get the sequence vms' information - total_vm_info = np.zeros((self._vm_window_size, len(self._vm_attributes)))) - + total_vm_info = np.zeros((self._vm_window_size, len(self._vm_attributes))) for idx in range(self._st, self._st + self._vm_window_size): if idx < self._static_vm_states.shape[0]: vm_info = self._static_vm_states[idx].copy() @@ -174,13 +168,11 @@ def _update_vm_state(self): "seed": 666 } - def get_env_wrapper(): env = Env(**env_config["basic"]) env.set_seed(env_config["seed"]) return 
VMEnvWrapper(env, **env_config["wrapper"]) - tmp_env_wrapper = get_env_wrapper() AGENT_IDS = tmp_env_wrapper.agent_idx_list STATE_DIM = tmp_env_wrapper.state_dim diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py index b824905dd..470d9c196 100644 --- a/examples/rl/vm_scheduling/policy_index.py +++ b/examples/rl/vm_scheduling/policy_index.py @@ -4,13 +4,43 @@ import os import sys +from maro.rl.exploration import EpsilonGreedyExploration + + cim_path = os.path.dirname(os.path.realpath(__file__)) if cim_path not in sys.path: sys.path.insert(0, cim_path) from dqn import get_dqn_policy -from env_wrapper import AGENT_IDS +from env_wrapper import AGENT_IDS, env_config -# use agent IDs as policy names since each agent uses a separate policy update_trigger = {name: 128 for name in AGENT_IDS} warmup = {name: 1 for name in AGENT_IDS} + +# use agent IDs as policy names since each agent uses a separate policy policy_func_index = {name: get_dqn_policy for name in AGENT_IDS} +agent2policy = {name: name for name in AGENT_IDS} + + +class VMExploration(EpsilonGreedyExploration): + def __call__(self, action_index: Union[int, np.ndarray], legal_action: np.ndarray): + if isinstance(action_index, np.ndarray): + return np.array([self._get_exploration_action(act) for act in action_index]) + else: + return self._get_exploration_action(action_index, legal_action) + + def _get_exploration_action(self, action_index: int, legal_action: np.ndarray): + assert (action_index < self._num_actions), f"Invalid action: {action_index}" + return action_index if np.random.random() > self.epsilon else np.random.choice(np.where(legal_action == 1)[0]) + + +""" +def __call__(self, action_index: Union[int, np.ndarray]): + if isinstance(action_index, np.ndarray): + return np.array([self._get_exploration_action(act) for act in action_index]) + else: + return self._get_exploration_action(action_index) + +def _get_exploration_action(self, action_index): + assert (action_index < self._num_actions), f"Invalid action: {action_index}" + return action_index if np.random.random() > self.epsilon else np.random.choice(self._num_actions) +""" \ No newline at end of file From c9bb66fa60090b3e4d5757ec81f75abb8aed231f Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 12:40:59 +0000 Subject: [PATCH 03/29] added DQN --- examples/rl/cim/dqn.py | 2 +- examples/rl/vm_scheduling/__init__.py | 10 +++-- examples/rl/vm_scheduling/ac.py | 46 ++++++++++------------- examples/rl/vm_scheduling/env_wrapper.py | 8 +--- examples/rl/vm_scheduling/policy_index.py | 34 ++++++++++++++--- 5 files changed, 58 insertions(+), 42 deletions(-) diff --git a/examples/rl/cim/dqn.py b/examples/rl/cim/dqn.py index 21f17264e..6b536bdc2 100644 --- a/examples/rl/cim/dqn.py +++ b/examples/rl/cim/dqn.py @@ -56,7 +56,7 @@ "beta": 0.4, "beta_step": 0.001 } - } + } } diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py index cddfbf182..b3d9785e3 100644 --- a/examples/rl/vm_scheduling/__init__.py +++ b/examples/rl/vm_scheduling/__init__.py @@ -1,8 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from .agent_wrapper import get_agent_wrapper from .env_wrapper import get_env_wrapper -from .policy_index import policy_func_index, update_trigger, warmup +from .policy_index import ( + agent2exploration, agent2policy, exploration_func_index, policy_func_index, update_trigger, warmup +) -__all__ = ["get_agent_wrapper", "get_env_wrapper", "policy_func_index", "update_trigger", "warmup"] +__all__ = [ + "agent2exploration", "agent2policy", "exploration_func_index", "get_env_wrapper", "policy_func_index", + "update_trigger", "warmup" +] diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 0e9e8704b..53c8ea554 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -47,6 +47,19 @@ "optim_params": {"lr": 0.001} } } + }, + "algorithm": { + "reward_discount": 0.9, + "train_epochs": 100, + "gradient_iters": 1, + "critic_loss_cls": "mse", + "actor_loss_coefficient": 0.1 + }, + "experience_manager": { + "capacity": 10000, + "overwrite_type": "rolling", + "batch_size": -1, + "replace": False } } @@ -54,39 +67,20 @@ class MyACNet(DiscreteACNet): def forward(self, states, actor: bool = True, critic: bool = True): inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - - if len(states.shape) == 1: - states = states.unsqueeze(dim=0) + masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) + if len(inputs.shape) == 1: + inputs = inputs.unsqueeze(dim=0) return ( - self.component["actor"](inputs) if actor else None, + self.component["actor"](inputs) * masks if actor else None, self.component["critic"](inputs) if critic else None ) - def get_action(self, states, training=True): - """ - Given Q-values for a batch of states, return the action index and the corresponding maximum Q-value - for each state. 
- """ - states, legal_action = states - legal_action = torch.from_numpy(np.asarray(legal_action)).to(self.device) - - if not training: - action_prob = self.forward(states, critic=False)[0] - _, action = (action_prob + (legal_action - 1) * 1e8).max(dim=1) - return action, action_prob - - action_prob = Categorical(self.forward(states, critic=False)[0] * legal_action) # (batch_size, action_space_size) - action = action_prob.sample() - log_p = action_prob.log_prob(action) - - return action, log_p - def get_ac_policy(): ac_net = MyACNet( component={ - "actor": config["actor_type"](**config["model"]["network"]["actor"]), - "critic": agent_config["critic_type"](**config["model"]["network"]["critic"]) + "actor": FullyConnectedBlock(**config["model"]["network"]["actor"]), + "critic": FullyConnectedBlock(**config["model"]["network"]["critic"]) }, optim_option={ "actor": OptimOption(**config["model"]["optimization"]["actor"]), @@ -94,4 +88,4 @@ def get_ac_policy(): } ) experience_manager = ExperienceManager(**config["experience_manager"]) - return ActorCritic(ac_net, experience_manager, ActorCriticConfig(**config["algorithm_config"])) + return ActorCritic(ac_net, experience_manager, ActorCriticConfig(**config["algorithm"])) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index f2c19c2d7..4b5b275f6 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -62,12 +62,8 @@ def get_state(self): legal_pm_mask[pm] = 1 else: legal_pm_mask[pm] = 0 - return { - "AGENT": { - "model": np.concatenate((pm_state.flatten(), vm_state.flatten())), - "legal_pm_mask": legal_pm_mask - } - } + + return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} def to_env_action(self, action_info): model_action = action_info[0] if isinstance(action_info, tuple) else action_info diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py index 470d9c196..e1d2d54df 100644 --- a/examples/rl/vm_scheduling/policy_index.py +++ b/examples/rl/vm_scheduling/policy_index.py @@ -4,31 +4,33 @@ import os import sys -from maro.rl.exploration import EpsilonGreedyExploration +import numpy as np + +from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler cim_path = os.path.dirname(os.path.realpath(__file__)) if cim_path not in sys.path: sys.path.insert(0, cim_path) -from dqn import get_dqn_policy +from ac import get_ac_policy from env_wrapper import AGENT_IDS, env_config update_trigger = {name: 128 for name in AGENT_IDS} warmup = {name: 1 for name in AGENT_IDS} # use agent IDs as policy names since each agent uses a separate policy -policy_func_index = {name: get_dqn_policy for name in AGENT_IDS} +policy_func_index = {name: get_ac_policy for name in AGENT_IDS} agent2policy = {name: name for name in AGENT_IDS} class VMExploration(EpsilonGreedyExploration): - def __call__(self, action_index: Union[int, np.ndarray], legal_action: np.ndarray): + def __call__(self, action_index, legal_action): if isinstance(action_index, np.ndarray): return np.array([self._get_exploration_action(act) for act in action_index]) else: return self._get_exploration_action(action_index, legal_action) - def _get_exploration_action(self, action_index: int, legal_action: np.ndarray): + def _get_exploration_action(self, action_index, legal_action): assert (action_index < self._num_actions), f"Invalid action: {action_index}" return action_index if 
np.random.random() > self.epsilon else np.random.choice(np.where(legal_action == 1)[0]) @@ -43,4 +45,24 @@ def __call__(self, action_index: Union[int, np.ndarray]): def _get_exploration_action(self, action_index): assert (action_index < self._num_actions), f"Invalid action: {action_index}" return action_index if np.random.random() > self.epsilon else np.random.choice(self._num_actions) -""" \ No newline at end of file +""" + +exploration_config = { + "last_ep": 400, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [[100, 0.32]] +} + +def get_exploration(): + epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) + epsilon_greedy.register_schedule( + scheduler_cls=MultiPhaseLinearExplorationScheduler, + param_name="epsilon", + **exploration_config + ) + return epsilon_greedy + + +exploration_func_index = {f"EpsilonGreedy": get_exploration} +agent2exploration = {name: "EpsilonGreedy" for name in AGENT_IDS} From 4e1a8b1debfc9492f65163f18ec8861ce917df7d Mon Sep 17 00:00:00 2001 From: yaqiu Date: Tue, 13 Jul 2021 02:34:19 +0000 Subject: [PATCH 04/29] added get_experiences func for ac in vm scheduling --- examples/rl/vm_scheduling/ac.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 53c8ea554..942624ee2 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -5,9 +5,10 @@ import sys import numpy as np +import scipy import torch -from maro.rl.experience import ExperienceManager +from maro.rl.experience import ExperienceManager, ExperienceSet from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig @@ -89,3 +90,30 @@ def get_ac_policy(): ) experience_manager = ExperienceManager(**config["experience_manager"]) return ActorCritic(ac_net, experience_manager, ActorCriticConfig(**config["algorithm"])) + + +def get_ac_experiences(replay_buffer): + def discount_cumsum(x, discount): + """ + magic from rllab for computing discounted cumulative sums of vectors. 
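+        Given x = [x0, x1, x2], this returns [x0 + d * x1 + d**2 * x2, x1 + d * x2, x2] for discount d.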
+ + Reference: https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/core.py + """ + return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] + + rewards = np.array(replay_buffer["rewards"]) + cumsum_rewards = discount_cumsum(rewards, self._gamma) + + exp_set = ExperienceSet( + replay_buffer["states"][:-1], + replay_buffer["actions"][:-1], + cumsum_rewards[:-1], + replay_buffer["states"][1:], + replay_buffer["info"][1:], + ) + del replay_buffer["states"][:-1] + del replay_buffer["actions"][:-1] + del replay_buffer["rewards"][:-1] + del replay_buffer["info"][:-1] + + return exp_set From c52085a67102701316be409ede2758a69d07ac7d Mon Sep 17 00:00:00 2001 From: yaqiu Date: Thu, 15 Jul 2021 07:46:42 +0000 Subject: [PATCH 05/29] added post_step callback to env wrapper --- examples/rl/vm_scheduling/ac.py | 33 +++++++++++++++--------- examples/rl/vm_scheduling/env_wrapper.py | 12 ++++++++- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 942624ee2..948a445dd 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -8,7 +8,7 @@ import scipy import torch -from maro.rl.experience import ExperienceManager, ExperienceSet +from maro.rl.experience import ExperienceSet, ExperienceStore, UniformSampler from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig @@ -56,11 +56,15 @@ "critic_loss_cls": "mse", "actor_loss_coefficient": 0.1 }, - "experience_manager": { + "experience_store": { "capacity": 10000, "overwrite_type": "rolling", "batch_size": -1, "replace": False + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 128, "replace": True} } } @@ -77,7 +81,7 @@ def forward(self, states, actor: bool = True, critic: bool = True): ) -def get_ac_policy(): +def get_ac_policy(mode="update"): ac_net = MyACNet( component={ "actor": FullyConnectedBlock(**config["model"]["network"]["actor"]), @@ -86,10 +90,20 @@ def get_ac_policy(): optim_option={ "actor": OptimOption(**config["model"]["optimization"]["actor"]), "critic": OptimOption(**config["model"]["optimization"]["critic"]) - } + } if mode != "inference" else None + ) + if mode == "update": + exp_store = ExperienceStore(**config["experience_store"]["update"]) + experience_sampler_kwargs = config["sampler"]["update"] + else: + exp_store = ExperienceStore(**config["experience_store"]["rollout" if mode == "inference" else "update"]) + experience_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] + + return ActorCritic( + ac_net, ActorCriticConfig(**config["algorithm"]), exp_store, + experience_sampler_cls=UniformSampler, + experience_sampler_kwargs=experience_sampler_kwargs ) - experience_manager = ExperienceManager(**config["experience_manager"]) - return ActorCritic(ac_net, experience_manager, ActorCriticConfig(**config["algorithm"])) def get_ac_experiences(replay_buffer): @@ -102,7 +116,7 @@ def discount_cumsum(x, discount): return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] rewards = np.array(replay_buffer["rewards"]) - cumsum_rewards = discount_cumsum(rewards, self._gamma) + cumsum_rewards = discount_cumsum(rewards, config["algorithm"]["reward_discount"]) exp_set = ExperienceSet( replay_buffer["states"][:-1], @@ -111,9 +125,4 @@ def discount_cumsum(x, discount): replay_buffer["states"][1:], replay_buffer["info"][1:], ) 
- del replay_buffer["states"][:-1] - del replay_buffer["actions"][:-1] - del replay_buffer["rewards"][:-1] - del replay_buffer["info"][:-1] - return exp_set diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index 4b5b275f6..3374bdea0 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -3,10 +3,20 @@ import numpy as np -from maro.rl.learning import AbsEnvWrapper +from maro.rl.learning import AbsEnvWrapper, Transition from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction +def post_step(env: Env, tracker: dict, transition: Transition): + tracker["env_metric"] = env.metrics + if "vm_cpu_cores_requirement" not in tracker: + tracker["vm_cpu_cores_requirement"] = [] + if "action_sequence" not in tracker: + tracker["action_sequence"] = [] + + tracker["vm_cpu_cores_requirement"].append([transition.action, transition.state["mask"]]) + tracker["action_sequence"].append(transition.action) + class VMEnvWrapper(AbsEnvWrapper): def __init__( From f324d912845a96ea08497fd44a1a6e86c4631264 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Thu, 15 Jul 2021 08:02:19 +0000 Subject: [PATCH 06/29] moved Aiming's tracking and plotting logic into callbacks --- examples/rl/vm_scheduling/ac.py | 29 ++++++++++++------------ examples/rl/vm_scheduling/env_wrapper.py | 11 ++++----- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 948a445dd..1dc260f78 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -69,19 +69,18 @@ } -class MyACNet(DiscreteACNet): - def forward(self, states, actor: bool = True, critic: bool = True): - inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) - if len(inputs.shape) == 1: - inputs = inputs.unsqueeze(dim=0) - return ( - self.component["actor"](inputs) * masks if actor else None, - self.component["critic"](inputs) if critic else None - ) - - def get_ac_policy(mode="update"): + class MyACNet(DiscreteACNet): + def forward(self, states, actor: bool = True, critic: bool = True): + inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) + masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) + if len(inputs.shape) == 1: + inputs = inputs.unsqueeze(dim=0) + return ( + self.component["actor"](inputs) * masks if actor else None, + self.component["critic"](inputs) if critic else None + ) + ac_net = MyACNet( component={ "actor": FullyConnectedBlock(**config["model"]["network"]["actor"]), @@ -94,15 +93,15 @@ def get_ac_policy(mode="update"): ) if mode == "update": exp_store = ExperienceStore(**config["experience_store"]["update"]) - experience_sampler_kwargs = config["sampler"]["update"] + exp_sampler_kwargs = config["sampler"]["update"] else: exp_store = ExperienceStore(**config["experience_store"]["rollout" if mode == "inference" else "update"]) - experience_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] + exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] return ActorCritic( ac_net, ActorCriticConfig(**config["algorithm"]), exp_store, experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=experience_sampler_kwargs + experience_sampler_kwargs=exp_sampler_kwargs ) diff --git 
a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index 3374bdea0..a888898dc 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -10,11 +10,11 @@ def post_step(env: Env, tracker: dict, transition: Transition): tracker["env_metric"] = env.metrics if "vm_cpu_cores_requirement" not in tracker: - tracker["vm_cpu_cores_requirement"] = [] + tracker["vm_core_requirement"] = [] if "action_sequence" not in tracker: tracker["action_sequence"] = [] - tracker["vm_cpu_cores_requirement"].append([transition.action, transition.state["mask"]]) + tracker["vm_core_requirement"].append([transition.action, transition.state["mask"]]) tracker["action_sequence"].append(transition.action) @@ -32,10 +32,9 @@ def __init__( vm_window_size: int = 1, pm_window_size: int = 1, gamma: float = 0.0, - reward_eval_delay: int = 0, - save_replay: bool = True + reward_eval_delay: int = 0 ): - super().__init__(env, reward_eval_delay=reward_eval_delay, save_replay=save_replay, replay_agent_ids=["AGENT"]) + super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) self._pm_attributes = pm_attributes self._vm_attributes = vm_attributes self._st = 0 @@ -50,7 +49,7 @@ def __init__( self._durations = durations # the duration of the whole environment self._pm_state_history = np.zeros((pm_window_size - 1, self._pm_num, 2)) self._state_dim = 2 * pm_num * pm_window_size + 5 * vm_window_size - + @property def state_dim(self): return self._state_dim From 272d2cc92c6e816958a087fc2239d99ab7feeeb9 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 16 Jul 2021 05:19:52 +0000 Subject: [PATCH 07/29] added eval env wrapper --- examples/rl/vm_scheduling/env_wrapper.py | 33 ++++++++++++- examples/rl/vm_scheduling/policy_index.py | 59 ++--------------------- 2 files changed, 36 insertions(+), 56 deletions(-) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index a888898dc..5aa054cdf 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -173,10 +173,39 @@ def _get_vm_state(self): "seed": 666 } -def get_env_wrapper(): + +eval_env_config = { + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k.short.test", + "start_tick": 0, + "durations": 300, + "snapshot_resolution": 1 + }, + "wrapper": { + "alpha": 0.0, + "beta": 1.0, + "pm_num": 8, + "durations": 200, + "vm_state_path": "../data/test_vm_states.npy", + "vm_window_size": 1, + "pm_window_size": 1, + "gamma": 0.9 + } +} + + +def get_env_wrapper(replay_agent_ids=None): env = Env(**env_config["basic"]) env.set_seed(env_config["seed"]) - return VMEnvWrapper(env, **env_config["wrapper"]) + return VMEnvWrapper(env, **env_config["wrapper"]) + + +def get_eval_env_wrapper(): + eval_env = Env(**eval_env_config["basic"]) + eval_env.set_seed(eval_env_config["seed"]) + return VMEnvWrapper(eval_env, **eval_env_config["wrapper"]) + tmp_env_wrapper = get_env_wrapper() AGENT_IDS = tmp_env_wrapper.agent_idx_list diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py index e1d2d54df..376e0c2f3 100644 --- a/examples/rl/vm_scheduling/policy_index.py +++ b/examples/rl/vm_scheduling/policy_index.py @@ -4,16 +4,12 @@ import os import sys -import numpy as np - -from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler - - -cim_path = os.path.dirname(os.path.realpath(__file__)) -if cim_path not in 
sys.path: - sys.path.insert(0, cim_path) +vm_path = os.path.dirname(os.path.realpath(__file__)) +if vm_path not in sys.path: + sys.path.insert(0, vm_path) from ac import get_ac_policy -from env_wrapper import AGENT_IDS, env_config +from dqn import get_dqn_policy +from env_wrapper import AGENT_IDS update_trigger = {name: 128 for name in AGENT_IDS} warmup = {name: 1 for name in AGENT_IDS} @@ -21,48 +17,3 @@ # use agent IDs as policy names since each agent uses a separate policy policy_func_index = {name: get_ac_policy for name in AGENT_IDS} agent2policy = {name: name for name in AGENT_IDS} - - -class VMExploration(EpsilonGreedyExploration): - def __call__(self, action_index, legal_action): - if isinstance(action_index, np.ndarray): - return np.array([self._get_exploration_action(act) for act in action_index]) - else: - return self._get_exploration_action(action_index, legal_action) - - def _get_exploration_action(self, action_index, legal_action): - assert (action_index < self._num_actions), f"Invalid action: {action_index}" - return action_index if np.random.random() > self.epsilon else np.random.choice(np.where(legal_action == 1)[0]) - - -""" -def __call__(self, action_index: Union[int, np.ndarray]): - if isinstance(action_index, np.ndarray): - return np.array([self._get_exploration_action(act) for act in action_index]) - else: - return self._get_exploration_action(action_index) - -def _get_exploration_action(self, action_index): - assert (action_index < self._num_actions), f"Invalid action: {action_index}" - return action_index if np.random.random() > self.epsilon else np.random.choice(self._num_actions) -""" - -exploration_config = { - "last_ep": 400, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [[100, 0.32]] -} - -def get_exploration(): - epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) - epsilon_greedy.register_schedule( - scheduler_cls=MultiPhaseLinearExplorationScheduler, - param_name="epsilon", - **exploration_config - ) - return epsilon_greedy - - -exploration_func_index = {f"EpsilonGreedy": get_exploration} -agent2exploration = {name: "EpsilonGreedy" for name in AGENT_IDS} From c51a7b64df65ecee68485fd6ba2d27d76b66e2b9 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 19 Jul 2021 05:56:45 +0000 Subject: [PATCH 08/29] renamed AC config variable name for VM --- examples/rl/vm_scheduling/ac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 1dc260f78..85268de77 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -54,7 +54,7 @@ "train_epochs": 100, "gradient_iters": 1, "critic_loss_cls": "mse", - "actor_loss_coefficient": 0.1 + "critic_loss_coeff": 0.1 }, "experience_store": { "capacity": 10000, From 74fc932cb2e4a41312fa4ccac418998733b20061 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 19 Jul 2021 08:29:11 +0000 Subject: [PATCH 09/29] vm scheduling RL code finished --- examples/rl/vm_scheduling/ac.py | 25 +----- examples/rl/vm_scheduling/callbacks.py | 88 ++++++++++++++++++ examples/rl/vm_scheduling/dqn.py | 119 +++++++++++++++++++++++++ 3 files changed, 208 insertions(+), 24 deletions(-) create mode 100644 examples/rl/vm_scheduling/callbacks.py create mode 100644 examples/rl/vm_scheduling/dqn.py diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 85268de77..9b96ea46d 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -5,10 +5,9 @@ import sys 
import numpy as np -import scipy import torch -from maro.rl.experience import ExperienceSet, ExperienceStore, UniformSampler +from maro.rl.experience import ExperienceStore, UniformSampler from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig @@ -103,25 +102,3 @@ def forward(self, states, actor: bool = True, critic: bool = True): experience_sampler_cls=UniformSampler, experience_sampler_kwargs=exp_sampler_kwargs ) - - -def get_ac_experiences(replay_buffer): - def discount_cumsum(x, discount): - """ - magic from rllab for computing discounted cumulative sums of vectors. - - Reference: https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/core.py - """ - return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] - - rewards = np.array(replay_buffer["rewards"]) - cumsum_rewards = discount_cumsum(rewards, config["algorithm"]["reward_discount"]) - - exp_set = ExperienceSet( - replay_buffer["states"][:-1], - replay_buffer["actions"][:-1], - cumsum_rewards[:-1], - replay_buffer["states"][1:], - replay_buffer["info"][1:], - ) - return exp_set diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py new file mode 100644 index 000000000..3cadaf46f --- /dev/null +++ b/examples/rl/vm_scheduling/callbacks.py @@ -0,0 +1,88 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import time +from os import makedirs +from os.path import dirname, join, realpath + +import matplotlib.pyplot as plt + +from maro.utils import Logger + +timestamp = str(time.time()) + +log_dir = join(dirname(realpath(__file__)), "log", timestamp) +makedirs(log_dir, exist_ok=True) + +plt_path = join(dirname(realpath(__file__)), "plots", timestamp) +makedirs(plt_path, exist_ok=True) + + +simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) + +def post_collect(trackers, ep, segment): + # print the env metric from each rollout worker + for tracker in trackers: + simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + + +def post_evaluate(trackers, ep): + # print the env metric from each rollout worker + for tracker in trackers: + simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + + for i, tracker in enumerate(trackers): + core_requirement = tracker["vm_core_requirement"] + action_sequence = tracker["action_sequence"] + # plot action sequence + fig = plt.figure(figsize=(40, 32)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(action_sequence) + fig.savefig(f"{plt_path}/action_sequence_{ep}") + plt.cla() + plt.close("all") + + # plot with legal action mask + fig = plt.figure(figsize=(40, 32)) + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 
1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) + + fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") + + plt.cla() + plt.close("all") + + # plot without legal actin mask + fig = plt.figure(figsize=(40, 32)) + + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0]) + + fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") + + plt.cla() + plt.close("all") diff --git a/examples/rl/vm_scheduling/dqn.py b/examples/rl/vm_scheduling/dqn.py new file mode 100644 index 000000000..4350f8d48 --- /dev/null +++ b/examples/rl/vm_scheduling/dqn.py @@ -0,0 +1,119 @@ + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import sys + +import numpy as np +import torch + +from maro.rl.experience import ExperienceStore, UniformSampler +from maro.rl.exploration import DiscreteSpaceExploration, MultiPhaseLinearExplorationScheduler +from maro.rl.model import DiscreteQNet, FullyConnectedBlock, OptimOption +from maro.rl.policy.algorithms import DQN, DQNConfig + +vm_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, vm_path) +from env_wrapper import STATE_DIM + +config = { + "model": { + "network": { + "input_dim": STATE_DIM, + "hidden_dims": [64, 128, 256], + "output_dim": 9, + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "skip_connection": False, + "head": True, + "dropout_p": 0.0 + }, + "optimization": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.0005}, + "scheduler_cls": "cosine_annealing_warm_restarts", + "scheduler_params": {"T_0": 500, "T_mult": 2} + } + }, + "algorithm": { + "reward_discount": 0.9, + "target_update_freq": 5, + "train_epochs": 100, + "soft_update_coeff": 0.1, + "double": False + }, + "experience_store": { + "rollout": {"capacity": 1000, "overwrite_type": "rolling"}, + "update": {"capacity": 10000, "overwrite_type": "rolling"} + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 256, "replace": True} + }, + "exploration": { + "last_ep": 400, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(100, 0.32)] + } +} + + +class MyQNet(DiscreteQNet): + def __init__(self, component, optim_option, device: str = None): + super().__init__(component, optim_option=optim_option, device=device) + for mdl in self.modules(): + if isinstance(mdl, torch.nn.Linear): + torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) + + def forward(self, states): + inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) + masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) + if len(inputs.shape) == 1: + inputs = inputs.unsqueeze(dim=0) + q_for_all_actions = self.component(inputs) + return q_for_all_actions + (masks - 1) * 1e8 + + +class MaskedEpsilonGreedy(DiscreteSpaceExploration): + def __init__(self, epsilon: float = .0): + super().__init__() + self.epsilon = epsilon + + def __call__(self, action, state): + mask = [st["mask"] for st in state] + return np.array( + [act if np.random.random() > self.epsilon else np.random.choice(np.where(mask == 1)[0]) 
for act in action] + ) + + +def get_dqn_policy(mode="update"): + assert mode in {"inference", "update", "inference-update"} + q_net = MyQNet( + FullyConnectedBlock(**config["model"]["network"]), + optim_option=OptimOption(**config["model"]["optimization"]) if mode != "inference" else None + ) + + if mode == "update": + exp_store = ExperienceStore(**config["experience_store"]["update"]) + exploration = None + exp_sampler_kwargs = config["sampler"]["update"] + else: + exp_store = ExperienceStore(**config["experience_store"]["rollout"]) + exploration = MaskedEpsilonGreedy() + exploration.register_schedule( + scheduler_cls=MultiPhaseLinearExplorationScheduler, + param_name="epsilon", + **config["exploration"] + ) + exp_store = ExperienceStore(**config["experience_store"]["rollout" if mode == "inference" else "update"]) + exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] + + return DQN( + q_net, DQNConfig(**config["algorithm"]), exp_store, + experience_sampler_cls=UniformSampler, + experience_sampler_kwargs=exp_sampler_kwargs, + exploration=exploration + ) From 9e9da67196381f8038091c68db23c0ae67f740c4 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 19 Jul 2021 08:44:27 +0000 Subject: [PATCH 10/29] updated README --- examples/rl/vm_scheduling/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/rl/vm_scheduling/README.md b/examples/rl/vm_scheduling/README.md index 2d280f476..a428a5c33 100644 --- a/examples/rl/vm_scheduling/README.md +++ b/examples/rl/vm_scheduling/README.md @@ -1,6 +1,6 @@ -# Container Inventory Management +# Virtual Machine Scheduling -Container inventory management (CIM) is a scenario where reinforcement learning (RL) can potentially prove useful. In this folder you can find: +Virtual Machine (VM) scheduling is a scenario where reinforcement learning (RL) can help the virtual machine allocator allocate compute resources intelligently. In this folder you can find: * ``env_wrapper.py``, which contains a function to generate an environment wrapper to interact with our "agent" (see below); * ``agent_wrapper.py``, which contains a function to generate an agent wrapper to interact From 453ec15c4bf4e1148bc6176dab37e6b6bc502864 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Thu, 22 Jul 2021 06:59:58 +0000 Subject: [PATCH 11/29] fixed various bugs and hard coding for vm_scheduling --- examples/rl/vm_scheduling/__init__.py | 13 +-- examples/rl/vm_scheduling/ac.py | 13 +-- examples/rl/vm_scheduling/callbacks.py | 127 ++++++++++----------- examples/rl/vm_scheduling/dqn.py | 21 ++-- examples/rl/vm_scheduling/env_wrapper.py | 130 +++++++++------------- examples/rl/vm_scheduling/policy_index.py | 16 +-- examples/rl/workflows/config.yml | 4 +- 7 files changed, 153 insertions(+), 171 deletions(-) diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py index b3d9785e3..3055ab873 100644 --- a/examples/rl/vm_scheduling/__init__.py +++ b/examples/rl/vm_scheduling/__init__.py @@ -1,12 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from .env_wrapper import get_env_wrapper -from .policy_index import ( - agent2exploration, agent2policy, exploration_func_index, policy_func_index, update_trigger, warmup -) +from .callbacks import post_collect, post_evaluate +from .env_wrapper import get_env_wrapper, get_eval_env_wrapper +from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup __all__ = [ - "agent2exploration", "agent2policy", "exploration_func_index", "get_env_wrapper", "policy_func_index", - "update_trigger", "warmup" -] + "agent2policy", "post_collect", "post_evaluate", "get_env_wrapper", "get_eval_env_wrapper", + "rl_policy_func_index", "update_trigger", "warmup" +] \ No newline at end of file diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index 9b96ea46d..f9fc8efa3 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -13,14 +13,14 @@ vm_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, vm_path) -from env_wrapper import STATE_DIM +from env_wrapper import NUM_PMS, STATE_DIM config = { "model": { "network": { "actor": { "input_dim": STATE_DIM, - "output_dim": 9, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 "hidden_dims": [64, 32, 32], "activation": "leaky_relu", "softmax": True, @@ -51,15 +51,12 @@ "algorithm": { "reward_discount": 0.9, "train_epochs": 100, - "gradient_iters": 1, "critic_loss_cls": "mse", "critic_loss_coeff": 0.1 }, "experience_store": { - "capacity": 10000, - "overwrite_type": "rolling", - "batch_size": -1, - "replace": False + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} }, "sampler": { "rollout": {"batch_size": -1, "replace": False}, @@ -71,6 +68,8 @@ def get_ac_policy(mode="update"): class MyACNet(DiscreteACNet): def forward(self, states, actor: bool = True, critic: bool = True): + if isinstance(states, dict): + states = [states] inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) if len(inputs.shape) == 1: diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py index 3cadaf46f..39cac9515 100644 --- a/examples/rl/vm_scheduling/callbacks.py +++ b/examples/rl/vm_scheduling/callbacks.py @@ -5,7 +5,7 @@ from os import makedirs from os.path import dirname, join, realpath -import matplotlib.pyplot as plt +# import matplotlib.pyplot as plt from maro.utils import Logger @@ -21,68 +21,69 @@ simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) def post_collect(trackers, ep, segment): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + # # print the env metric from each rollout worker + # for tracker in trackers: + # simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + # # print the average env metric + # if len(trackers) > 1: + # metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + # 
avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + # simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + pass def post_evaluate(trackers, ep): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") - - for i, tracker in enumerate(trackers): - core_requirement = tracker["vm_core_requirement"] - action_sequence = tracker["action_sequence"] - # plot action sequence - fig = plt.figure(figsize=(40, 32)) - ax = fig.add_subplot(1, 1, 1) - ax.plot(action_sequence) - fig.savefig(f"{plt_path}/action_sequence_{ep}") - plt.cla() - plt.close("all") - - # plot with legal action mask - fig = plt.figure(figsize=(40, 32)) - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) - - fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") - - plt.cla() - plt.close("all") - - # plot without legal actin mask - fig = plt.figure(figsize=(40, 32)) - - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0]) - - fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") - - plt.cla() - plt.close("all") + # # print the env metric from each rollout worker + # for tracker in trackers: + # simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + + # # print the average env metric + # if len(trackers) > 1: + # metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + # avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + # simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + + # for i, tracker in enumerate(trackers): + # core_requirement = tracker["vm_core_requirement"] + # action_sequence = tracker["action_sequence"] + # # plot action sequence + # fig = plt.figure(figsize=(40, 32)) + # ax = fig.add_subplot(1, 1, 1) + # ax.plot(action_sequence) + # fig.savefig(f"{plt_path}/action_sequence_{ep}") + # plt.cla() + # plt.close("all") + + # # plot with legal action mask + # fig = plt.figure(figsize=(40, 32)) + # for idx, key in enumerate(core_requirement.keys()): + # ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + # for i in range(len(core_requirement[key])): + # if i == 0: + # ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) + # ax.legend() + # else: + # ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) + + # fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") + + # plt.cla() + # plt.close("all") + + # # plot without legal actin mask + # fig = 
plt.figure(figsize=(40, 32)) + + # for idx, key in enumerate(core_requirement.keys()): + # ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + # for i in range(len(core_requirement[key])): + # if i == 0: + # ax.plot(core_requirement[key][i][0], label=str(key)) + # ax.legend() + # else: + # ax.plot(core_requirement[key][i][0]) + + # fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") + + # plt.cla() + # plt.close("all") + pass diff --git a/examples/rl/vm_scheduling/dqn.py b/examples/rl/vm_scheduling/dqn.py index 4350f8d48..754f66617 100644 --- a/examples/rl/vm_scheduling/dqn.py +++ b/examples/rl/vm_scheduling/dqn.py @@ -15,14 +15,14 @@ vm_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, vm_path) -from env_wrapper import STATE_DIM +from env_wrapper import NUM_PMS, STATE_DIM config = { "model": { "network": { "input_dim": STATE_DIM, "hidden_dims": [64, 128, 256], - "output_dim": 9, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 "activation": "leaky_relu", "softmax": False, "batch_norm": False, @@ -39,14 +39,14 @@ }, "algorithm": { "reward_discount": 0.9, - "target_update_freq": 5, + "update_target_every": 5, "train_epochs": 100, "soft_update_coeff": 0.1, "double": False }, "experience_store": { - "rollout": {"capacity": 1000, "overwrite_type": "rolling"}, - "update": {"capacity": 10000, "overwrite_type": "rolling"} + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} }, "sampler": { "rollout": {"batch_size": -1, "replace": False}, @@ -69,6 +69,8 @@ def __init__(self, component, optim_option, device: str = None): torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) def forward(self, states): + if isinstance(states, dict): + states = [states] inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) if len(inputs.shape) == 1: @@ -83,10 +85,13 @@ def __init__(self, epsilon: float = .0): self.epsilon = epsilon def __call__(self, action, state): + if isinstance(state, dict): + state = [state] mask = [st["mask"] for st in state] - return np.array( - [act if np.random.random() > self.epsilon else np.random.choice(np.where(mask == 1)[0]) for act in action] - ) + return np.array([ + act if np.random.random() > self.epsilon else np.random.choice(np.where(mk == 1)[0]) + for act, mk in zip(action, mask) + ]) def get_dqn_policy(mode="update"): diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index 5aa054cdf..0d3109f79 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -14,8 +14,8 @@ def post_step(env: Env, tracker: dict, transition: Transition): if "action_sequence" not in tracker: tracker["action_sequence"] = [] - tracker["vm_core_requirement"].append([transition.action, transition.state["mask"]]) - tracker["action_sequence"].append(transition.action) + tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) + tracker["action_sequence"].append(transition.action["AGENT"]) class VMEnvWrapper(AbsEnvWrapper): @@ -26,10 +26,6 @@ def __init__( vm_attributes: list, alpha: float, beta: float, - pm_num: int, - durations: int, - vm_state_path: str, - vm_window_size: int = 1, pm_window_size: int = 1, gamma: float = 0.0, reward_eval_delay: int = 0 @@ -38,31 +34,34 @@ def 
__init__( self._pm_attributes = pm_attributes self._vm_attributes = vm_attributes self._st = 0 - self._static_vm_states = np.load(vm_state_path) - self._vm_window_size = vm_window_size self._pm_window_size = pm_window_size # adjust the ratio of the success allocation and the total income when computing the reward self._alpha = alpha self._beta = beta self._gamma = gamma # reward discount - self._pm_num = pm_num # the number of pms - self._durations = durations # the duration of the whole environment - self._pm_state_history = np.zeros((pm_window_size - 1, self._pm_num, 2)) - self._state_dim = 2 * pm_num * pm_window_size + 5 * vm_window_size + self._num_pms = self.env._business_engine._pm_amount # the number of pms + self._durations = self.env._business_engine._max_tick + self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) + self._legal_pm_mask = None + self._state_dim = 2 * self._num_pms * pm_window_size + 4 @property def state_dim(self): return self._state_dim - def get_state(self): + @property + def num_pms(self): + return self._num_pms + + def get_state(self, tick=None): pm_state, vm_state = self._get_pm_state(), self._get_vm_state() # get the legal number of PM. - legal_pm_mask = np.zeros(self._pm_num + 1) + legal_pm_mask = np.zeros(self._num_pms + 1) if len(self._event.valid_pms) <= 0: # no pm available - legal_pm_mask[self._pm_num] = 1 + legal_pm_mask[self._num_pms] = 1 else: - legal_pm_mask[self._pm_num] = 1 + legal_pm_mask[self._num_pms] = 1 remain_cpu_dict = dict() for pm in self._event.valid_pms: # if two pm has same remaining cpu, only choose the one which has smaller id @@ -72,33 +71,38 @@ def get_state(self): else: legal_pm_mask[pm] = 0 + self._legal_pm_mask = legal_pm_mask return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} def to_env_action(self, action_info): + action_info = action_info["AGENT"] model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._pm_num: - action = PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + if model_action == self._num_pms: + return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) else: - action = AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) - return {"AGENT": action} + return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) - def get_reward(self, action_info): - model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._pm_num: - if np.sum(self._state_info["AGENT"]["legal_pm"]) != 1: + def get_reward(self, actions, tick=None): + if isinstance(actions, PostponeAction): # postponement + if np.sum(self._legal_pm_mask) != 1: reward = -0.1 * self._alpha + 0.0 * self._beta else: reward = 0.0 * self._alpha + 0.0 * self._beta - else: + elif self._event: + vm_unit_price = self.env._business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) reward = ( - 1.0 * self._alpha + self._beta * self._event.vm_unit_price * - min(self._durations - self._event.frame_index, self._event.vm_lifetime) + 1.0 * self._alpha + self._beta * vm_unit_price * + min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) ) - return {"AGENT": reward} + else: + reward = .0 + return {"AGENT": np.float32(reward)} def _get_pm_state(self): total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] - total_pm_info = total_pm_info.reshape(self._pm_num, 
len(self._pm_attributes)) + total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) # normalize the attributes of pms' cpu and memory self._max_cpu_capacity = np.max(total_pm_info[:, 0]) @@ -107,47 +111,26 @@ def _get_pm_state(self): total_pm_info[:, 3] /= self._max_memory_capacity # get the remaining cpu and memory of the pms - remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._pm_num, 1) - remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._pm_num, 1) + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) + remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) # get the pms' information - total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, pm_num, 2) + total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) # get the sequence pms' information self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[-self._pm_window_size:, :, :].copy() # (win_size, pm_num, 2) + return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) def _get_vm_state(self): - if self._vm_window_size == 1: - # get the vm's infomation - vm_info = np.array([ - self._event.vm_cpu_cores_requirement, - self._event.vm_memory_requirement, - min(self._durations - self.env.tick, self._event.vm_lifetime) / 200, - (self._durations - self.env.tick) * 1.0 / 200, - self._event.vm_unit_price * min(self._durations - self.env.tick, self._event.vm_lifetime) - ], dtype=np.float) - vm_info[0] /= self._max_cpu_capacity - vm_info[1] /= self._max_memory_capacity - return vm_info - else: - # get the sequence vms' information - total_vm_info = np.zeros((self._vm_window_size, len(self._vm_attributes))) - for idx in range(self._st, self._st + self._vm_window_size): - if idx < self._static_vm_states.shape[0]: - vm_info = self._static_vm_states[idx].copy() - vm_info[0] /= self._max_cpu_capacity - vm_info[1] /= self._max_memory_capacity - vm_info[4] = vm_info[4] * min(self._durations - vm_info[3], vm_info[2]) - vm_info[2] = (vm_info[2] * 1.0) / 200 - vm_info[3] = (self._durations - vm_info[3]) * 1.0 / 200 - else: - vm_info = np.zeros(len(self._vm_attributes), dtype=np.float) - - total_vm_info[self._vm_window_size - (idx - self._st + 1), :] = vm_info - - self._st = (self._st + 1) % self._static_vm_states.shape[0] - return total_vm_info + vm_info = np.array([ + self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, + self._event.vm_memory_requirement / self._max_memory_capacity, + (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE + self.env._business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) + ], dtype=np.float32) + return vm_info env_config = { @@ -155,18 +138,14 @@ def _get_vm_state(self): "scenario": "vm_scheduling", "topology": "azure.2019.10k", "start_tick": 0, - "durations": 8638, + "durations": 300, # 8638 "snapshot_resolution": 1 }, "wrapper": { "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], "alpha": 0.0, "beta": 1.0, - "pm_num": 8, - "durations": 200, - "vm_state_path": "../data/train_vm_states.npy", - "vm_window_size": 1, 
"pm_window_size": 1, "gamma": 0.9 }, @@ -177,21 +156,20 @@ def _get_vm_state(self): eval_env_config = { "basic": { "scenario": "vm_scheduling", - "topology": "azure.2019.10k.short.test", + "topology": "azure.2019.10k.oversubscription", "start_tick": 0, "durations": 300, "snapshot_resolution": 1 }, "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], "alpha": 0.0, "beta": 1.0, - "pm_num": 8, - "durations": 200, - "vm_state_path": "../data/test_vm_states.npy", - "vm_window_size": 1, "pm_window_size": 1, "gamma": 0.9 - } + }, + "seed": 1024 } @@ -208,6 +186,6 @@ def get_eval_env_wrapper(): tmp_env_wrapper = get_env_wrapper() -AGENT_IDS = tmp_env_wrapper.agent_idx_list STATE_DIM = tmp_env_wrapper.state_dim +NUM_PMS = tmp_env_wrapper.num_pms del tmp_env_wrapper diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py index 376e0c2f3..d9f3dad26 100644 --- a/examples/rl/vm_scheduling/policy_index.py +++ b/examples/rl/vm_scheduling/policy_index.py @@ -4,16 +4,16 @@ import os import sys -vm_path = os.path.dirname(os.path.realpath(__file__)) -if vm_path not in sys.path: - sys.path.insert(0, vm_path) + +cim_path = os.path.dirname(os.path.realpath(__file__)) +if cim_path not in sys.path: + sys.path.insert(0, cim_path) from ac import get_ac_policy from dqn import get_dqn_policy -from env_wrapper import AGENT_IDS -update_trigger = {name: 128 for name in AGENT_IDS} -warmup = {name: 1 for name in AGENT_IDS} +update_trigger = {"POLICY": 128} +warmup = {"POLICY": 1} # use agent IDs as policy names since each agent uses a separate policy -policy_func_index = {name: get_ac_policy for name in AGENT_IDS} -agent2policy = {name: name for name in AGENT_IDS} +rl_policy_func_index = {"POLICY": get_ac_policy} +agent2policy = {"AGENT": "POLICY"} diff --git a/examples/rl/workflows/config.yml b/examples/rl/workflows/config.yml index f39994b74..5bb1d84d2 100644 --- a/examples/rl/workflows/config.yml +++ b/examples/rl/workflows/config.yml @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-job_name: cim-dqn -scenario: cim +job_name: cim +scenario: cim # cim, vm_scheduling mode: sync num_episodes: 5 eval_schedule: 5 From ed4da3a5bb7e6d995d2e392bc5dbd90293b3cc9e Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 26 Jul 2021 01:11:32 +0000 Subject: [PATCH 12/29] uncommented callbacks for VM scheduling --- examples/rl/vm_scheduling/callbacks.py | 126 ++++++++++++------------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py index 39cac9515..f82291374 100644 --- a/examples/rl/vm_scheduling/callbacks.py +++ b/examples/rl/vm_scheduling/callbacks.py @@ -5,7 +5,7 @@ from os import makedirs from os.path import dirname, join, realpath -# import matplotlib.pyplot as plt +import matplotlib.pyplot as plt from maro.utils import Logger @@ -21,69 +21,67 @@ simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) def post_collect(trackers, ep, segment): - # # print the env metric from each rollout worker - # for tracker in trackers: - # simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + # print the env metric from each rollout worker + for tracker in trackers: + simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") - # # print the average env metric - # if len(trackers) > 1: - # metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - # avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - # simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") - pass + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") def post_evaluate(trackers, ep): - # # print the env metric from each rollout worker - # for tracker in trackers: - # simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") - - # # print the average env metric - # if len(trackers) > 1: - # metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - # avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - # simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") - - # for i, tracker in enumerate(trackers): - # core_requirement = tracker["vm_core_requirement"] - # action_sequence = tracker["action_sequence"] - # # plot action sequence - # fig = plt.figure(figsize=(40, 32)) - # ax = fig.add_subplot(1, 1, 1) - # ax.plot(action_sequence) - # fig.savefig(f"{plt_path}/action_sequence_{ep}") - # plt.cla() - # plt.close("all") - - # # plot with legal action mask - # fig = plt.figure(figsize=(40, 32)) - # for idx, key in enumerate(core_requirement.keys()): - # ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - # for i in range(len(core_requirement[key])): - # if i == 0: - # ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) - # ax.legend() - # else: - # ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) - - # fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") - - # plt.cla() - # plt.close("all") - - # # plot without legal actin mask - # fig = plt.figure(figsize=(40, 32)) - - # 
for idx, key in enumerate(core_requirement.keys()): - # ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - # for i in range(len(core_requirement[key])): - # if i == 0: - # ax.plot(core_requirement[key][i][0], label=str(key)) - # ax.legend() - # else: - # ax.plot(core_requirement[key][i][0]) - - # fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") - - # plt.cla() - # plt.close("all") - pass + # print the env metric from each rollout worker + for tracker in trackers: + simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + + for i, tracker in enumerate(trackers): + core_requirement = tracker["vm_core_requirement"] + action_sequence = tracker["action_sequence"] + # plot action sequence + fig = plt.figure(figsize=(40, 32)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(action_sequence) + fig.savefig(f"{plt_path}/action_sequence_{ep}") + plt.cla() + plt.close("all") + + # plot with legal action mask + fig = plt.figure(figsize=(40, 32)) + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) + + fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") + + plt.cla() + plt.close("all") + + # plot without legal actin mask + fig = plt.figure(figsize=(40, 32)) + + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0]) + + fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") + + plt.cla() + plt.close("all") From 03b6b78df03a76bb25341060a7156cedef4ab3ac Mon Sep 17 00:00:00 2001 From: Huoran Li Date: Thu, 5 Aug 2021 14:23:50 +0800 Subject: [PATCH 13/29] Minor revision for better code style --- examples/rl/vm_scheduling/__init__.py | 22 +++++++++++----------- examples/rl/vm_scheduling/ac.py | 4 ++-- examples/rl/vm_scheduling/callbacks.py | 3 ++- examples/rl/vm_scheduling/dqn.py | 3 +-- examples/rl/vm_scheduling/env_wrapper.py | 11 ++++++----- examples/rl/vm_scheduling/policy_index.py | 3 +-- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py index 3055ab873..6d4293807 100644 --- a/examples/rl/vm_scheduling/__init__.py +++ b/examples/rl/vm_scheduling/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .callbacks import post_collect, post_evaluate -from .env_wrapper import get_env_wrapper, get_eval_env_wrapper -from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup - -__all__ = [ - "agent2policy", "post_collect", "post_evaluate", "get_env_wrapper", "get_eval_env_wrapper", - "rl_policy_func_index", "update_trigger", "warmup" -] \ No newline at end of file +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT license. + +from .callbacks import post_collect, post_evaluate +from .env_wrapper import get_env_wrapper, get_eval_env_wrapper +from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup + +__all__ = [ + "agent2policy", "post_collect", "post_evaluate", "get_env_wrapper", "get_eval_env_wrapper", + "rl_policy_func_index", "update_trigger", "warmup" +] diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py index f9fc8efa3..01abe5ceb 100644 --- a/examples/rl/vm_scheduling/ac.py +++ b/examples/rl/vm_scheduling/ac.py @@ -7,13 +7,13 @@ import numpy as np import torch +from env_wrapper import NUM_PMS, STATE_DIM from maro.rl.experience import ExperienceStore, UniformSampler from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig vm_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM config = { "model": { @@ -85,7 +85,7 @@ def forward(self, states, actor: bool = True, critic: bool = True): "critic": FullyConnectedBlock(**config["model"]["network"]["critic"]) }, optim_option={ - "actor": OptimOption(**config["model"]["optimization"]["actor"]), + "actor": OptimOption(**config["model"]["optimization"]["actor"]), "critic": OptimOption(**config["model"]["optimization"]["critic"]) } if mode != "inference" else None ) diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py index f82291374..04571bcdc 100644 --- a/examples/rl/vm_scheduling/callbacks.py +++ b/examples/rl/vm_scheduling/callbacks.py @@ -17,9 +17,9 @@ plt_path = join(dirname(realpath(__file__)), "plots", timestamp) makedirs(plt_path, exist_ok=True) - simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) + def post_collect(trackers, ep, segment): # print the env metric from each rollout worker for tracker in trackers: @@ -31,6 +31,7 @@ def post_collect(trackers, ep, segment): avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + def post_evaluate(trackers, ep): # print the env metric from each rollout worker for tracker in trackers: diff --git a/examples/rl/vm_scheduling/dqn.py b/examples/rl/vm_scheduling/dqn.py index 754f66617..cd8a60d21 100644 --- a/examples/rl/vm_scheduling/dqn.py +++ b/examples/rl/vm_scheduling/dqn.py @@ -1,4 +1,3 @@ - # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
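# [Illustrative note, not part of this patch] ac.py and dqn.py both rely on prepending
# their own directory to sys.path so that sibling modules such as env_wrapper can be
# imported by plain module name, independent of the working directory:
#     import os, sys
#     sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
#     from env_wrapper import NUM_PMS, STATE_DIM
# The hunks in this revision simply move that env_wrapper import up into the main import block.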
@@ -8,6 +7,7 @@ import numpy as np import torch +from env_wrapper import NUM_PMS, STATE_DIM from maro.rl.experience import ExperienceStore, UniformSampler from maro.rl.exploration import DiscreteSpaceExploration, MultiPhaseLinearExplorationScheduler from maro.rl.model import DiscreteQNet, FullyConnectedBlock, OptimOption @@ -15,7 +15,6 @@ vm_path = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM config = { "model": { diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py index 0d3109f79..e604b64ce 100644 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ b/examples/rl/vm_scheduling/env_wrapper.py @@ -7,6 +7,7 @@ from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction + def post_step(env: Env, tracker: dict, transition: Transition): tracker["env_metric"] = env.metrics if "vm_cpu_cores_requirement" not in tracker: @@ -38,9 +39,9 @@ def __init__( # adjust the ratio of the success allocation and the total income when computing the reward self._alpha = alpha self._beta = beta - self._gamma = gamma # reward discount - self._num_pms = self.env._business_engine._pm_amount # the number of pms - self._durations = self.env._business_engine._max_tick + self._gamma = gamma # reward discount + self._num_pms = self.env.business_engine._pm_amount # the number of pms + self._durations = self.env.business_engine._max_tick self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) self._legal_pm_mask = None self._state_dim = 2 * self._num_pms * pm_window_size + 4 @@ -89,7 +90,7 @@ def get_reward(self, actions, tick=None): else: reward = 0.0 * self._alpha + 0.0 * self._beta elif self._event: - vm_unit_price = self.env._business_engine._get_unit_price( + vm_unit_price = self.env.business_engine._get_unit_price( self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement ) reward = ( @@ -126,7 +127,7 @@ def _get_vm_state(self): self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, self._event.vm_memory_requirement / self._max_memory_capacity, (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE - self.env._business_engine._get_unit_price( + self.env.business_engine._get_unit_price( self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement ) ], dtype=np.float32) diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py index d9f3dad26..e57bf752c 100644 --- a/examples/rl/vm_scheduling/policy_index.py +++ b/examples/rl/vm_scheduling/policy_index.py @@ -4,12 +4,11 @@ import os import sys +from ac import get_ac_policy cim_path = os.path.dirname(os.path.realpath(__file__)) if cim_path not in sys.path: sys.path.insert(0, cim_path) -from ac import get_ac_policy -from dqn import get_dqn_policy update_trigger = {"POLICY": 128} warmup = {"POLICY": 1} From 1b4d1fc0cabd2ae3d101b6328aa92671925a31c6 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 09:24:35 +0000 Subject: [PATCH 14/29] added part of vm scheduling RL code --- examples/rl/vm_scheduling/__init__.py | 11 -- examples/rl/vm_scheduling/ac.py | 103 ----------- examples/rl/vm_scheduling/agent_wrapper.py | 35 ++++ examples/rl/vm_scheduling/env_wrapper.py | 191 --------------------- examples/rl/vm_scheduling/policy_index.py | 19 -- 5 files changed, 35 insertions(+), 324 deletions(-) delete mode 100644 examples/rl/vm_scheduling/__init__.py delete mode 100644 
examples/rl/vm_scheduling/ac.py create mode 100644 examples/rl/vm_scheduling/agent_wrapper.py delete mode 100644 examples/rl/vm_scheduling/env_wrapper.py delete mode 100644 examples/rl/vm_scheduling/policy_index.py diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py deleted file mode 100644 index 44af25424..000000000 --- a/examples/rl/vm_scheduling/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .callbacks import post_collect, post_evaluate -from .env_wrapper import get_env_sampler, get_test_env_wrapper -from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup - -__all__ = [ - "agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "get_test_env_wrapper", - "rl_policy_func_index", "update_trigger", "warmup" -] \ No newline at end of file diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py deleted file mode 100644 index 34eacf69d..000000000 --- a/examples/rl/vm_scheduling/ac.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - -import numpy as np -import torch - -from maro.rl.experience import ReplayMemory, UniformSampler -from maro.rl.model import DiscreteACNet, FullyConnected, OptimOption -from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig - -vm_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM - -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "replay_memory": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } -} - - -def get_ac_policy(mode="update"): - class MyACNet(DiscreteACNet): - def forward(self, states, actor: bool = True, critic: bool = True): - if isinstance(states, dict): - states = [states] - inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) - if len(inputs.shape) == 1: - inputs = inputs.unsqueeze(dim=0) - return ( - self.component["actor"](inputs) * masks if actor else None, - self.component["critic"](inputs) if critic else None - ) - - ac_net = MyACNet( - component={ - "actor": FullyConnected(**config["model"]["network"]["actor"]), - "critic": FullyConnected(**config["model"]["network"]["critic"]) - }, - optim_option={ - "actor": OptimOption(**config["model"]["optimization"]["actor"]), - "critic": OptimOption(**config["model"]["optimization"]["critic"]) - } if mode != "inference" else 
None - ) - if mode == "update": - exp_store = ReplayMemory(**config["replay_memory"]["update"]) - exp_sampler_kwargs = config["sampler"]["update"] - else: - exp_store = ReplayMemory(**config["replay_memory"]["rollout" if mode == "inference" else "update"]) - exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] - - return ActorCritic( - ac_net, ActorCriticConfig(**config["algorithm"]), exp_store, - experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=exp_sampler_kwargs - ) diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py new file mode 100644 index 000000000..27e23788a --- /dev/null +++ b/examples/rl/vm_scheduling/agent_wrapper.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import sys + +from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler +from maro.rl.learning import AgentWrapper + +cim_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, cim_path) +from env_wrapper import AGENT_IDS, env_config +from policy_index import policy_func_index + + +exploration_config = { + "last_ep": 10, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(5, 0.32)] +} + +def get_agent_wrapper(): + epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) + epsilon_greedy.register_schedule( + scheduler_cls=MultiPhaseLinearExplorationScheduler, + param_name="epsilon", + **exploration_config + ) + return AgentWrapper( + {name: func(learning=False) for name, func in policy_func_index.items()}, + {name: name for name in AGENT_IDS}, + exploration_dict={f"EpsilonGreedy": epsilon_greedy}, + agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} + ) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py deleted file mode 100644 index 0e2d714e7..000000000 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
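# [Illustrative sketch, not part of this patch] The exploration_config passed to the
# scheduler in get_agent_wrapper() above describes a multi-phase linear epsilon schedule:
# epsilon starts at 0.4, passes through 0.32 at episode 5 (the single split point) and
# reaches 0.0 at episode 10. Assuming plain piecewise-linear interpolation (the MARO
# scheduler's exact semantics are not shown in this patch), the schedule looks like:
def _epsilon_at(ep, points=((0, 0.4), (5, 0.32), (10, 0.0))):
    # walk consecutive (episode, epsilon) breakpoints and interpolate linearly between them
    for (e0, v0), (e1, v1) in zip(points, points[1:]):
        if e0 <= ep <= e1:
            return v0 + (v1 - v0) * (ep - e0) / (e1 - e0)
    return points[-1][1]  # beyond last_ep, stay at the final value
# e.g. _epsilon_at(3) -> about 0.352, _epsilon_at(7) -> about 0.192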
- -import numpy as np - -from maro.rl.learning import AbsEnvWrapper, Transition -from maro.simulator import Env -from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction - -def post_step(env: Env, tracker: dict, transition: Transition): - tracker["env_metric"] = env.metrics - if "vm_cpu_cores_requirement" not in tracker: - tracker["vm_core_requirement"] = [] - if "action_sequence" not in tracker: - tracker["action_sequence"] = [] - - tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) - tracker["action_sequence"].append(transition.action["AGENT"]) - - -class VMEnvWrapper(AbsEnvWrapper): - def __init__( - self, - env: Env, - pm_attributes: list, - vm_attributes: list, - alpha: float, - beta: float, - pm_window_size: int = 1, - gamma: float = 0.0, - reward_eval_delay: int = 0 - ): - super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) - self._pm_attributes = pm_attributes - self._vm_attributes = vm_attributes - self._st = 0 - self._pm_window_size = pm_window_size - # adjust the ratio of the success allocation and the total income when computing the reward - self._alpha = alpha - self._beta = beta - self._gamma = gamma # reward discount - self._num_pms = self.env._business_engine._pm_amount # the number of pms - self._durations = self.env._business_engine._max_tick - self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) - self._legal_pm_mask = None - self._state_dim = 2 * self._num_pms * pm_window_size + 4 - - @property - def state_dim(self): - return self._state_dim - - @property - def num_pms(self): - return self._num_pms - - def get_state(self, tick=None): - pm_state, vm_state = self._get_pm_state(), self._get_vm_state() - # get the legal number of PM. 
- legal_pm_mask = np.zeros(self._num_pms + 1) - if len(self._event.valid_pms) <= 0: - # no pm available - legal_pm_mask[self._num_pms] = 1 - else: - legal_pm_mask[self._num_pms] = 1 - remain_cpu_dict = dict() - for pm in self._event.valid_pms: - # if two pm has same remaining cpu, only choose the one which has smaller id - if pm_state[-1, pm, 0] not in remain_cpu_dict: - remain_cpu_dict[pm_state[-1, pm, 0]] = 1 - legal_pm_mask[pm] = 1 - else: - legal_pm_mask[pm] = 0 - - self._legal_pm_mask = legal_pm_mask - return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} - - def to_env_action(self, action_info): - action_info = action_info["AGENT"] - model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._num_pms: - return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) - else: - return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) - - def get_reward(self, actions, tick=None): - if isinstance(actions, PostponeAction): # postponement - if np.sum(self._legal_pm_mask) != 1: - reward = -0.1 * self._alpha + 0.0 * self._beta - else: - reward = 0.0 * self._alpha + 0.0 * self._beta - elif self._event: - vm_unit_price = self.env._business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement - ) - reward = ( - 1.0 * self._alpha + self._beta * vm_unit_price * - min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) - ) - else: - reward = .0 - return {"AGENT": np.float32(reward)} - - def _get_pm_state(self): - total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] - total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) - - # normalize the attributes of pms' cpu and memory - self._max_cpu_capacity = np.max(total_pm_info[:, 0]) - self._max_memory_capacity = np.max(total_pm_info[:, 1]) - total_pm_info[:, 2] /= self._max_cpu_capacity - total_pm_info[:, 3] /= self._max_memory_capacity - - # get the remaining cpu and memory of the pms - remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) - remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) - - # get the pms' information - total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) - - # get the sequence pms' information - self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) - - def _get_vm_state(self): - vm_info = np.array([ - self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, - self._event.vm_memory_requirement / self._max_memory_capacity, - (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE - self.env._business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement - ) - ], dtype=np.float32) - return vm_info - - -env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k", - "start_tick": 0, - "durations": 300, # 8638 - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 666 -} - - -test_env_config 
= { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k.oversubscription", - "start_tick": 0, - "durations": 300, - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 1024 -} - - -def get_env_sampler(replay_agent_ids=None): - env = Env(**env_config["basic"]) - env.set_seed(env_config["seed"]) - return VMEnvWrapper(env, **env_config["wrapper"]) - - -def get_test_env_wrapper(): - test_env = Env(**test_env_config["basic"]) - test_env.set_seed(test_env_config["seed"]) - return VMEnvWrapper(test_env, **test_env_config["wrapper"]) - - -tmp_env_wrapper = get_env_sampler() -STATE_DIM = tmp_env_wrapper.state_dim -NUM_PMS = tmp_env_wrapper.num_pms -del tmp_env_wrapper diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py deleted file mode 100644 index d9f3dad26..000000000 --- a/examples/rl/vm_scheduling/policy_index.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - - -cim_path = os.path.dirname(os.path.realpath(__file__)) -if cim_path not in sys.path: - sys.path.insert(0, cim_path) -from ac import get_ac_policy -from dqn import get_dqn_policy - -update_trigger = {"POLICY": 128} -warmup = {"POLICY": 1} - -# use agent IDs as policy names since each agent uses a separate policy -rl_policy_func_index = {"POLICY": get_ac_policy} -agent2policy = {"AGENT": "POLICY"} From 610f68138a1bc9cb5d58d37e8eea5bf7d4667926 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 11:11:48 +0000 Subject: [PATCH 15/29] refined vm env_wrapper code style --- examples/rl/vm_scheduling/agent_wrapper.py | 35 -------- examples/rl/vm_scheduling/config.py | 100 +++++++++++++++++++++ examples/rl/vm_scheduling/policies.py | 0 3 files changed, 100 insertions(+), 35 deletions(-) delete mode 100644 examples/rl/vm_scheduling/agent_wrapper.py create mode 100644 examples/rl/vm_scheduling/config.py create mode 100644 examples/rl/vm_scheduling/policies.py diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py deleted file mode 100644 index 27e23788a..000000000 --- a/examples/rl/vm_scheduling/agent_wrapper.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler -from maro.rl.learning import AgentWrapper - -cim_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, cim_path) -from env_wrapper import AGENT_IDS, env_config -from policy_index import policy_func_index - - -exploration_config = { - "last_ep": 10, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(5, 0.32)] -} - -def get_agent_wrapper(): - epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) - epsilon_greedy.register_schedule( - scheduler_cls=MultiPhaseLinearExplorationScheduler, - param_name="epsilon", - **exploration_config - ) - return AgentWrapper( - {name: func(learning=False) for name, func in policy_func_index.items()}, - {name: name for name in AGENT_IDS}, - exploration_dict={f"EpsilonGreedy": epsilon_greedy}, - agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} - ) diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py new file mode 100644 index 000000000..629ed65dd --- /dev/null +++ b/examples/rl/vm_scheduling/config.py @@ -0,0 +1,100 @@ + +config = { + "model": { + "network": { + "actor": { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": "leaky_relu", + "softmax": True, + "batch_norm": False, + "head": True + }, + "critic": { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True + } + }, + "optimization": { + "actor": { + "optim_cls": "adam", + "optim_params": {"lr": 0.0001} + }, + "critic": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.001} + } + } + }, + "algorithm": { + "reward_discount": 0.9, + "train_epochs": 100, + "critic_loss_cls": "mse", + "critic_loss_coeff": 0.1 + }, + "experience_store": { + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 128, "replace": True} + } +} + + +config = { + "model": { + "network": { + "actor": { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": "leaky_relu", + "softmax": True, + "batch_norm": False, + "head": True + }, + "critic": { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True + } + }, + "optimization": { + "actor": { + "optim_cls": "adam", + "optim_params": {"lr": 0.0001} + }, + "critic": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.001} + } + } + }, + "algorithm": { + "reward_discount": 0.9, + "train_epochs": 100, + "critic_loss_cls": "mse", + "critic_loss_coeff": 0.1 + }, + "experience_store": { + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 128, "replace": True} + } +} + diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py new file mode 100644 index 000000000..e69de29bb From 6ba958f10e1f2c1c2b64df9e0e2043f45db228d8 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 19 Jul 
2021 08:29:11 +0000 Subject: [PATCH 16/29] vm scheduling RL code finished --- examples/rl/vm_scheduling/callbacks.py | 87 ----------------- examples/rl/vm_scheduling/dqn.py | 124 ------------------------- 2 files changed, 211 deletions(-) delete mode 100644 examples/rl/vm_scheduling/callbacks.py delete mode 100644 examples/rl/vm_scheduling/dqn.py diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py deleted file mode 100644 index f82291374..000000000 --- a/examples/rl/vm_scheduling/callbacks.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import time -from os import makedirs -from os.path import dirname, join, realpath - -import matplotlib.pyplot as plt - -from maro.utils import Logger - -timestamp = str(time.time()) - -log_dir = join(dirname(realpath(__file__)), "log", timestamp) -makedirs(log_dir, exist_ok=True) - -plt_path = join(dirname(realpath(__file__)), "plots", timestamp) -makedirs(plt_path, exist_ok=True) - - -simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) - -def post_collect(trackers, ep, segment): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") - -def post_evaluate(trackers, ep): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") - - for i, tracker in enumerate(trackers): - core_requirement = tracker["vm_core_requirement"] - action_sequence = tracker["action_sequence"] - # plot action sequence - fig = plt.figure(figsize=(40, 32)) - ax = fig.add_subplot(1, 1, 1) - ax.plot(action_sequence) - fig.savefig(f"{plt_path}/action_sequence_{ep}") - plt.cla() - plt.close("all") - - # plot with legal action mask - fig = plt.figure(figsize=(40, 32)) - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) - - fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") - - plt.cla() - plt.close("all") - - # plot without legal actin mask - fig = plt.figure(figsize=(40, 32)) - - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0]) - - fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") - - plt.cla() - plt.close("all") diff --git 
a/examples/rl/vm_scheduling/dqn.py b/examples/rl/vm_scheduling/dqn.py deleted file mode 100644 index 4f7499779..000000000 --- a/examples/rl/vm_scheduling/dqn.py +++ /dev/null @@ -1,124 +0,0 @@ - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - -import numpy as np -import torch - -from maro.rl.experience import ReplayMemory, UniformSampler -from maro.rl.exploration import DiscreteSpaceExploration, MultiLinearExplorationScheduler -from maro.rl.model import DiscreteQNet, FullyConnected, OptimOption -from maro.rl.policy.algorithms import DQN, DQNConfig - -vm_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM - -config = { - "model": { - "network": { - "input_dim": STATE_DIM, - "hidden_dims": [64, 128, 256], - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "skip_connection": False, - "head": True, - "dropout_p": 0.0 - }, - "optimization": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.0005}, - "scheduler_cls": "cosine_annealing_warm_restarts", - "scheduler_params": {"T_0": 500, "T_mult": 2} - } - }, - "algorithm": { - "reward_discount": 0.9, - "update_target_every": 5, - "train_epochs": 100, - "soft_update_coeff": 0.1, - "double": False - }, - "replay_memory": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 256, "replace": True} - }, - "exploration": { - "last_ep": 400, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(100, 0.32)] - } -} - - -class MyQNet(DiscreteQNet): - def __init__(self, component, optim_option, device: str = None): - super().__init__(component, optim_option=optim_option, device=device) - for mdl in self.modules(): - if isinstance(mdl, torch.nn.Linear): - torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) - - def forward(self, states): - if isinstance(states, dict): - states = [states] - inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) - if len(inputs.shape) == 1: - inputs = inputs.unsqueeze(dim=0) - q_for_all_actions = self.component(inputs) - return q_for_all_actions + (masks - 1) * 1e8 - - -class MaskedEpsilonGreedy(DiscreteSpaceExploration): - def __init__(self, epsilon: float = .0): - super().__init__() - self.epsilon = epsilon - - def __call__(self, action, state): - if isinstance(state, dict): - state = [state] - mask = [st["mask"] for st in state] - return np.array([ - act if np.random.random() > self.epsilon else np.random.choice(np.where(mk == 1)[0]) - for act, mk in zip(action, mask) - ]) - - -def get_dqn_policy(mode="update"): - assert mode in {"inference", "update", "inference-update"} - q_net = MyQNet( - FullyConnected(**config["model"]["network"]), - optim_option=OptimOption(**config["model"]["optimization"]) if mode != "inference" else None - ) - - if mode == "update": - exp_store = ReplayMemory(**config["replay_memory"]["update"]) - exploration = None - exp_sampler_kwargs = config["sampler"]["update"] - else: - exp_store = ReplayMemory(**config["replay_memory"]["rollout"]) - exploration = MaskedEpsilonGreedy() - exploration.register_schedule( - 
scheduler_cls=MultiLinearExplorationScheduler, - param_name="epsilon", - **config["exploration"] - ) - exp_store = ReplayMemory(**config["replay_memory"]["rollout" if mode == "inference" else "update"]) - exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] - - return DQN( - q_net, DQNConfig(**config["algorithm"]), exp_store, - experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=exp_sampler_kwargs, - exploration=exploration - ) From 5cb38f23e498030d5aba30fc5ef731c7ec8b3cd9 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Tue, 7 Sep 2021 07:22:18 +0000 Subject: [PATCH 17/29] added config.py for vm scheduing --- examples/rl/vm_scheduling/config.py | 119 ++++++++------ examples/rl/vm_scheduling/env_sampler.py | 192 +++++++++++++++++++++++ 2 files changed, 264 insertions(+), 47 deletions(-) create mode 100644 examples/rl/vm_scheduling/env_sampler.py diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 629ed65dd..26b594ed9 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -1,51 +1,76 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "experience_store": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } +import torch +from torch.optim import Adam, SGD + + +env_conf = { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k", + "start_tick": 0, + "durations": 300, # 8638 + "snapshot_resolution": 1 +} + +pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], +vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + +shaping_conf = { + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9, + "seed": 666 +} + + +eval_env_conf = { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k.oversubscription", + "start_tick": 0, + "durations": 300, + "snapshot_resolution": 1 +} + +eval_shaping_conf = { + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9, + "seed": 1024 +} + +actor_net_conf = { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": torch.nn.LeakyReLU, + "softmax": True, + "batch_norm": False, + "head": True +} + +critic_net_conf = { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True +} + +actor_optim_conf = (Adam, {"lr": 
0.0001}) +critic_optim_conf = (SGD, {"lr": 0.001}) + +ac_conf = { + "reward_discount": 0.9, + "grad_iters": 100, + "critic_loss_cls": torch.nn.MSELoss, + "critic_loss_coeff": 0.1, + "max_trajectory_len": 10000, + "get_loss_on_rollout": False } diff --git a/examples/rl/vm_scheduling/env_sampler.py b/examples/rl/vm_scheduling/env_sampler.py new file mode 100644 index 000000000..929b4a264 --- /dev/null +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -0,0 +1,192 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import numpy as np + +from maro.rl.learning import AbsEnvSampler +from maro.simulator import Env +from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction + + +def post_step(env: Env, tracker: dict, transition): + tracker["env_metric"] = env.metrics + if "vm_cpu_cores_requirement" not in tracker: + tracker["vm_core_requirement"] = [] + if "action_sequence" not in tracker: + tracker["action_sequence"] = [] + + tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) + tracker["action_sequence"].append(transition.action["AGENT"]) + + +class VMEnvSampler(AbsEnvSampler): + def __init__( + self, + env: Env, + pm_attributes: list, + vm_attributes: list, + alpha: float, + beta: float, + pm_window_size: int = 1, + gamma: float = 0.0, + reward_eval_delay: int = 0 + ): + super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) + self._pm_attributes = pm_attributes + self._vm_attributes = vm_attributes + self._st = 0 + self._pm_window_size = pm_window_size + # adjust the ratio of the success allocation and the total income when computing the reward + self._alpha = alpha + self._beta = beta + self._gamma = gamma # reward discount + self._num_pms = self.env.business_engine._pm_amount # the number of pms + self._durations = self.env.business_engine._max_tick + self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) + self._legal_pm_mask = None + self._state_dim = 2 * self._num_pms * pm_window_size + 4 + + @property + def state_dim(self): + return self._state_dim + + @property + def num_pms(self): + return self._num_pms + + def get_state(self, tick=None): + pm_state, vm_state = self._get_pm_state(), self._get_vm_state() + # get the legal number of PM. 
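# A minimal, self-contained sketch of the legal-PM mask constructed just below, using made-up
# remaining-CPU values rather than snapshot data. The mask has num_pms + 1 slots: one per PM plus
# a final slot for postponement, which is always legal; valid PMs sharing the same remaining CPU
# keep only the smallest id.
import numpy as np

remaining_cpu = np.array([0.50, 0.25, 0.50, 0.75])  # newest window frame, one value per PM
valid_pms, num_pms = [0, 1, 2, 3], 4
mask = np.zeros(num_pms + 1)
mask[num_pms] = 1                        # postponement is always allowed
seen = set()
for pm in valid_pms:
    if remaining_cpu[pm] not in seen:    # keep only the smallest-id PM per remaining-CPU value
        seen.add(remaining_cpu[pm])
        mask[pm] = 1
print(mask)                              # [1. 1. 0. 1. 1.]: PM 2 is dropped in favour of PM 0
# The "model" part of the state then holds 2 * num_pms * pm_window_size PM features plus 4 VM
# features, matching the self._state_dim computed in __init__ above.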
+ legal_pm_mask = np.zeros(self._num_pms + 1) + if len(self._event.valid_pms) <= 0: + # no pm available + legal_pm_mask[self._num_pms] = 1 + else: + legal_pm_mask[self._num_pms] = 1 + remain_cpu_dict = dict() + for pm in self._event.valid_pms: + # if two pm has same remaining cpu, only choose the one which has smaller id + if pm_state[-1, pm, 0] not in remain_cpu_dict: + remain_cpu_dict[pm_state[-1, pm, 0]] = 1 + legal_pm_mask[pm] = 1 + else: + legal_pm_mask[pm] = 0 + + self._legal_pm_mask = legal_pm_mask + return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} + + def to_env_action(self, action_info): + action_info = action_info["AGENT"] + model_action = action_info[0] if isinstance(action_info, tuple) else action_info + if model_action == self._num_pms: + return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + else: + return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) + + def get_reward(self, actions, tick=None): + if isinstance(actions, PostponeAction): # postponement + if np.sum(self._legal_pm_mask) != 1: + reward = -0.1 * self._alpha + 0.0 * self._beta + else: + reward = 0.0 * self._alpha + 0.0 * self._beta + elif self._event: + vm_unit_price = self.env.business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) + reward = ( + 1.0 * self._alpha + self._beta * vm_unit_price * + min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) + ) + else: + reward = .0 + return {"AGENT": np.float32(reward)} + + def _get_pm_state(self): + total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] + total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) + + # normalize the attributes of pms' cpu and memory + self._max_cpu_capacity = np.max(total_pm_info[:, 0]) + self._max_memory_capacity = np.max(total_pm_info[:, 1]) + total_pm_info[:, 2] /= self._max_cpu_capacity + total_pm_info[:, 3] /= self._max_memory_capacity + + # get the remaining cpu and memory of the pms + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) + remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) + + # get the pms' information + total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) + + # get the sequence pms' information + self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) + return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + + def _get_vm_state(self): + vm_info = np.array([ + self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, + self._event.vm_memory_requirement / self._max_memory_capacity, + (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE + self.env.business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) + ], dtype=np.float32) + return vm_info + + +env_config = { + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k", + "start_tick": 0, + "durations": 300, # 8638 + "snapshot_resolution": 1 + }, + "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9 + }, + "seed": 666 +} + + +eval_env_config = 
{ + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k.oversubscription", + "start_tick": 0, + "durations": 300, + "snapshot_resolution": 1 + }, + "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9 + }, + "seed": 1024 +} + + +def get_env_wrapper(replay_agent_ids=None): + env = Env(**env_config["basic"]) + env.set_seed(env_config["seed"]) + return VMEnvWrapper(env, **env_config["wrapper"]) + + +def get_eval_env_wrapper(): + eval_env = Env(**eval_env_config["basic"]) + eval_env.set_seed(eval_env_config["seed"]) + return VMEnvWrapper(eval_env, **eval_env_config["wrapper"]) + + +tmp_env_wrapper = get_env_wrapper() +STATE_DIM = tmp_env_wrapper.state_dim +NUM_PMS = tmp_env_wrapper.num_pms +del tmp_env_wrapper \ No newline at end of file From 465411014666bd1b5831c2660bb9ff6a1e8d96b7 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Tue, 7 Sep 2021 11:30:05 +0000 Subject: [PATCH 18/29] vm example refactoring --- examples/rl/vm_scheduling/config.py | 121 ++++++++-------- examples/rl/vm_scheduling/env_sampler.py | 167 ++++++++--------------- examples/rl/vm_scheduling/policies.py | 107 +++++++++++++++ examples/rl/workflows/config.yml | 4 +- 4 files changed, 223 insertions(+), 176 deletions(-) diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 26b594ed9..5b44f0e65 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -2,7 +2,9 @@ # Licensed under the MIT license. import torch -from torch.optim import Adam, SGD +from torch.optim import Adam, SGD, lr_scheduler + +from maro.simulator import Env env_conf = { @@ -13,37 +15,38 @@ "snapshot_resolution": 1 } +num_pms = Env(**env_conf).business_engine._pm_amount +pm_window_size = 1 +state_dim = 2 * num_pms * pm_window_size + 4 + pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], -vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - -shaping_conf = { +# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + + +reward_shaping_conf = { "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9, - "seed": 666 + "beta": 1.0 } +seed = 666 - -eval_env_conf = { +test_env_conf = { "scenario": "vm_scheduling", "topology": "azure.2019.10k.oversubscription", "start_tick": 0, "durations": 300, "snapshot_resolution": 1 } - -eval_shaping_conf = { +test_reward_shaping_conf = { "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9, - "seed": 1024 + "beta": 1.0 } +test_seed = 1024 + +######################################### A2C settings ######################################## actor_net_conf = { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "input_dim": state_dim, + "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "hidden_dims": [64, 32, 32], "activation": torch.nn.LeakyReLU, "softmax": True, @@ -52,10 +55,10 @@ } critic_net_conf = { - "input_dim": STATE_DIM, + "input_dim": state_dim, "output_dim": 1, "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", + "activation": torch.nn.LeakyReLU, "softmax": False, "batch_norm": False, "head": True @@ 
-73,53 +76,37 @@ "get_loss_on_rollout": False } +######################################### DQN settings ######################################## +q_net_conf = { + "input_dim": state_dim, + "hidden_dims": [64, 128, 256], + "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 + "activation": torch.nn.LeakyReLU, + "softmax": False, + "batch_norm": False, + "skip_connection": False, + "head": True, + "dropout_p": 0.0 +} -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "experience_store": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } +q_net_optim_conf = (SGD, {"lr": 0.0005}) +q_net_lr_scheduler_conf = (lr_scheduler.CosineAnnealingWarmRestarts, {"T_0": 500, "T_mult": 2}) + +dqn_conf = { + "reward_discount": 0.9, + "update_target_every": 5, + "train_epochs": 100, + "soft_update_coeff": 0.1, + "double": False, + "replay_memory_capacity": 10000, + "rollout_batch_size": 2560, + "train_batch_size": 256, } + +exploration_conf = { + "last_ep": 400, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(100, 0.32)] +} diff --git a/examples/rl/vm_scheduling/env_sampler.py b/examples/rl/vm_scheduling/env_sampler.py index 929b4a264..c2282984c 100644 --- a/examples/rl/vm_scheduling/env_sampler.py +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -1,12 +1,24 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
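# The exploration_conf in config.py above describes a piecewise-linear epsilon decay: 0.4 at the
# first episode, 0.32 at episode 100 and 0.0 from episode 400 onward. A small standalone sketch of
# that shape (only an illustration; the real MultiLinearExplorationScheduler in maro.rl may differ
# in detail):
def piecewise_linear_epsilon(ep, initial_value=0.4, final_value=0.0, last_ep=400, splits=((100, 0.32),)):
    # interpolate linearly through (1, initial_value) -> splits -> (last_ep, final_value)
    points = [(1, initial_value), *sorted(splits), (last_ep, final_value)]
    if ep >= last_ep:
        return final_value
    for (e0, v0), (e1, v1) in zip(points, points[1:]):
        if e0 <= ep <= e1:
            return v0 + (v1 - v0) * (ep - e0) / (e1 - e0)
    return initial_value

print([round(piecewise_linear_epsilon(ep), 2) for ep in (1, 50, 100, 250, 400)])
# -> [0.4, 0.36, 0.32, 0.16, 0.0]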
+import sys +from os.path import dirname, realpath + import numpy as np +from maro.rl.exploration import MultiLinearExplorationScheduler from maro.rl.learning import AbsEnvSampler from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction +vm_path = dirname(realpath(__file__)) +sys.path.insert(0, vm_path) +from config import ( + env_conf, exploration_conf, pm_attributes, pm_window_size, reward_shaping_conf, seed, test_env_conf, + test_reward_shaping_conf, test_seed +) +from policies import policy_func_dict + def post_step(env: Env, tracker: dict, transition): tracker["env_metric"] = env.metrics @@ -22,49 +34,37 @@ def post_step(env: Env, tracker: dict, transition): class VMEnvSampler(AbsEnvSampler): def __init__( self, - env: Env, - pm_attributes: list, - vm_attributes: list, - alpha: float, - beta: float, - pm_window_size: int = 1, - gamma: float = 0.0, - reward_eval_delay: int = 0 + get_env, + get_policy_func_dict, + exploration_scheduler_option, + agent2policy, + get_test_env=None, + post_step=None ): - super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) - self._pm_attributes = pm_attributes - self._vm_attributes = vm_attributes - self._st = 0 - self._pm_window_size = pm_window_size + super().__init__( + get_env, get_policy_func_dict, exploration_scheduler_option, agent2policy, + get_test_env=get_test_env, post_step=post_step + ) + self._learn_env.set_seed(seed) + self._test_env.set_seed(test_seed) + # adjust the ratio of the success allocation and the total income when computing the reward - self._alpha = alpha - self._beta = beta - self._gamma = gamma # reward discount - self._num_pms = self.env.business_engine._pm_amount # the number of pms + self.num_pms = self.env.business_engine._pm_amount # the number of pms self._durations = self.env.business_engine._max_tick - self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) + self._pm_state_history = np.zeros((pm_window_size - 1, self.num_pms, 2)) self._legal_pm_mask = None - self._state_dim = 2 * self._num_pms * pm_window_size + 4 - - @property - def state_dim(self): - return self._state_dim - - @property - def num_pms(self): - return self._num_pms def get_state(self, tick=None): pm_state, vm_state = self._get_pm_state(), self._get_vm_state() # get the legal number of PM. 
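# Once the legal-PM mask is appended to the flat state vector (see the concatenation a few lines
# below), a Q-network can recover it by slicing and suppress illegal actions additively, which is
# the convention the Q-net in policies.py relies on. A tiny numeric sketch with made-up Q-values,
# where num_features stands in for 2 * num_pms * pm_window_size + 4:
import numpy as np

num_features = 6
state = np.concatenate([np.random.rand(num_features),   # PM + VM features
                        np.array([1.0, 0.0, 1.0])])      # mask for 2 PMs + postponement
features, mask = state[:num_features], state[num_features:]

q = np.array([0.7, 0.9, 0.1])            # stand-in for the network's output on `features`
masked_q = q + (mask - 1.0) * 1e8        # illegal actions drop to about -1e8, legal ones unchanged
print(int(masked_q.argmax()))            # 0: PM 1 can no longer be selected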
- legal_pm_mask = np.zeros(self._num_pms + 1) - if len(self._event.valid_pms) <= 0: + legal_pm_mask = np.zeros(self.num_pms + 1) + if len(self.event.valid_pms) <= 0: # no pm available - legal_pm_mask[self._num_pms] = 1 + legal_pm_mask[self.num_pms] = 1 else: - legal_pm_mask[self._num_pms] = 1 + legal_pm_mask[self.num_pms] = 1 remain_cpu_dict = dict() - for pm in self._event.valid_pms: + for pm in self.event.valid_pms: # if two pm has same remaining cpu, only choose the one which has smaller id if pm_state[-1, pm, 0] not in remain_cpu_dict: remain_cpu_dict[pm_state[-1, pm, 0]] = 1 @@ -73,37 +73,38 @@ def get_state(self, tick=None): legal_pm_mask[pm] = 0 self._legal_pm_mask = legal_pm_mask - return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} + return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), legal_pm_mask))} def to_env_action(self, action_info): action_info = action_info["AGENT"] model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._num_pms: - return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + if model_action == self.num_pms: + return PostponeAction(vm_id=self.event.vm_id, postpone_step=1) else: - return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) + return AllocateAction(vm_id=self.event.vm_id, pm_id=model_action) - def get_reward(self, actions, tick=None): + def get_reward(self, actions, tick): + conf = reward_shaping_conf if self.env == self._learn_env else test_reward_shaping_conf if isinstance(actions, PostponeAction): # postponement if np.sum(self._legal_pm_mask) != 1: - reward = -0.1 * self._alpha + 0.0 * self._beta + reward = -0.1 * conf["alpha"] + 0.0 * conf["beta"] else: - reward = 0.0 * self._alpha + 0.0 * self._beta - elif self._event: + reward = 0.0 * conf["alpha"] + 0.0 * conf["beta"] + elif self.event: vm_unit_price = self.env.business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) reward = ( - 1.0 * self._alpha + self._beta * vm_unit_price * - min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) + 1.0 * conf["alpha"] + conf["beta"] * vm_unit_price * + min(self._durations - self.event.frame_index, self.event.remaining_buffer_time) ) else: reward = .0 return {"AGENT": np.float32(reward)} def _get_pm_state(self): - total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] - total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) + total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::pm_attributes] + total_pm_info = total_pm_info.reshape(self.num_pms, len(pm_attributes)) # normalize the attributes of pms' cpu and memory self._max_cpu_capacity = np.max(total_pm_info[:, 0]) @@ -112,81 +113,33 @@ def _get_pm_state(self): total_pm_info[:, 3] /= self._max_memory_capacity # get the remaining cpu and memory of the pms - remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) - remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self.num_pms, 1) + remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self.num_pms, 1) # get the pms' information total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) # get the sequence pms' information self._pm_state_history = 
np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + return self._pm_state_history[pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) def _get_vm_state(self): vm_info = np.array([ - self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, - self._event.vm_memory_requirement / self._max_memory_capacity, + self.event.vm_cpu_cores_requirement / self._max_cpu_capacity, + self.event.vm_memory_requirement / self._max_memory_capacity, (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE self.env.business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) ], dtype=np.float32) return vm_info -env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k", - "start_tick": 0, - "durations": 300, # 8638 - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 666 -} - - -eval_env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k.oversubscription", - "start_tick": 0, - "durations": 300, - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 1024 -} - - -def get_env_wrapper(replay_agent_ids=None): - env = Env(**env_config["basic"]) - env.set_seed(env_config["seed"]) - return VMEnvWrapper(env, **env_config["wrapper"]) - - -def get_eval_env_wrapper(): - eval_env = Env(**eval_env_config["basic"]) - eval_env.set_seed(eval_env_config["seed"]) - return VMEnvWrapper(eval_env, **eval_env_config["wrapper"]) - - -tmp_env_wrapper = get_env_wrapper() -STATE_DIM = tmp_env_wrapper.state_dim -NUM_PMS = tmp_env_wrapper.num_pms -del tmp_env_wrapper \ No newline at end of file +def get_env_sampler(): + return VMEnvSampler( + get_env=lambda: Env(**env_conf), + get_policy_func_dict=policy_func_dict, + exploration_scheduler_option={"dqn": {"epsilon": (MultiLinearExplorationScheduler, exploration_conf)}}, + agent2policy={"AGENT": "dqn"}, + get_test_env=lambda: Env(**test_env_conf) + ) diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py index e69de29bb..25e109a65 100644 --- a/examples/rl/vm_scheduling/policies.py +++ b/examples/rl/vm_scheduling/policies.py @@ -0,0 +1,107 @@ + +import sys +from os.path import dirname, realpath + +import numpy as np +import torch + +from maro.rl.modeling import DiscreteACNet, DiscreteQNet, FullyConnected +from maro.rl.policy import DQN, ActorCritic + +vm_path = dirname(realpath(__file__)) +sys.path.insert(0, vm_path) +from config import ( + ac_conf, actor_net_conf, actor_optim_conf, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, + q_net_optim_conf, state_dim +) + + +class MyQNet(DiscreteQNet): + def __init__(self): + super().__init__() + for mdl in self.modules(): + if isinstance(mdl, torch.nn.Linear): + 
torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) + + self.fc = FullyConnected(**q_net_conf) + self.optim = q_net_optim_conf[0](self.fc.parameters(), **q_net_optim_conf[1]) + + @property + def input_dim(self): + return state_dim + + @property + def num_actions(self): + return q_net_conf["output_dim"] + + def forward(self, states): + inputs = states[:, :state_dim] + masks = states[:, state_dim:] + q_for_all_actions = self.fc(inputs) + return q_for_all_actions + (masks - 1) * 1e8 + + def step(self, loss): + self.optim.zero_grad() + loss.backward() + self.optim.step() + + def get_gradients(self, loss): + self.optim.zero_grad() + loss.backward() + return {name: param.grad for name, param in self.named_parameters()} + + def apply_gradients(self, grad): + for name, param in self.named_parameters(): + param.grad = grad[name] + + self.optim.step() + + +def masked_eps_greedy(action, num_actions, state, *, epsilon): + mask = [st["mask"] for st in state] + return np.array([ + act if np.random.random() > epsilon else np.random.choice(np.where(mk == 1)[0]) + for act, mk in zip(action, mask) + ]) + + +class MyACNet(DiscreteACNet): + def __init__(self): + super().__init__() + self.actor = FullyConnected(**actor_net_conf) + self.critic = FullyConnected(**critic_net_conf) + self.actor_optim = actor_optim_conf[0](self.actor.parameters(), **actor_optim_conf[1]) + self.critic_optim = critic_optim_conf[0](self.critic.parameters(), **critic_optim_conf[1]) + + @property + def input_dim(self): + return state_dim + + def forward(self, states, actor: bool = True, critic: bool = True): + return (self.actor(states) if actor else None), (self.critic(states) if critic else None) + + def step(self, loss): + self.actor_optim.zero_grad() + self.critic_optim.zero_grad() + loss.backward() + self.actor_optim.step() + self.critic_optim.step() + + def get_gradients(self, loss): + self.actor_optim.zero_grad() + self.critic_optim.zero_grad() + loss.backward() + return {name: param.grad for name, param in self.named_parameters()} + + def apply_gradients(self, grad): + for name, param in self.named_parameters(): + param.grad = grad[name] + + self.actor_optim.step() + self.critic_optim.step() + + +policy_func_dict = { + "dqn": lambda name: DQN(name, MyQNet(), **dqn_conf), + "ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf) +} diff --git a/examples/rl/workflows/config.yml b/examples/rl/workflows/config.yml index f88293441..d97b89e65 100644 --- a/examples/rl/workflows/config.yml +++ b/examples/rl/workflows/config.yml @@ -24,9 +24,9 @@ async: group: async num_actors: 3 policy_manager: - type: distributed # simple, distributed + type: simple # simple, distributed simple: - parallel: false + parallel: true distributed: group: learn num_hosts: 2 From c51b9e63476b0976970b5ac3a77b1ce9bc3acae1 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Sun, 12 Sep 2021 09:05:08 +0000 Subject: [PATCH 19/29] fixed bugs in vm_scheduling --- docker_files/dev.df | 1 + examples/rl/vm_scheduling/__init__.py | 8 +++ examples/rl/vm_scheduling/callbacks.py | 86 ++++++++++++++++++++++++ examples/rl/vm_scheduling/config.py | 43 +++++++----- examples/rl/vm_scheduling/env_sampler.py | 55 ++++++--------- examples/rl/vm_scheduling/policies.py | 37 ++++------ maro/rl/policy/ac.py | 6 +- 7 files changed, 162 insertions(+), 74 deletions(-) create mode 100644 examples/rl/vm_scheduling/__init__.py create mode 100644 examples/rl/vm_scheduling/callbacks.py diff --git a/docker_files/dev.df b/docker_files/dev.df index 
f2f126eb5..4b08d2671 100644 --- a/docker_files/dev.df +++ b/docker_files/dev.df @@ -20,6 +20,7 @@ RUN pip install --no-cache-dir pyzmq==19.0.2 RUN pip install --no-cache-dir numpy==1.19.1 RUN pip install --no-cache-dir torch==1.6.0 RUN pip install --no-cache-dir scipy +RUN pip install --no-cache-dir matplotlib RUN pip install --no-cache-dir redis COPY maro /maro/maro diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py new file mode 100644 index 000000000..280488bcc --- /dev/null +++ b/examples/rl/vm_scheduling/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .callbacks import post_collect, post_evaluate +from .env_sampler import get_env_sampler +from .policies import policy_func_dict + +__all__ = ["post_collect", "post_evaluate", "get_env_sampler", "policy_func_dict"] diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py new file mode 100644 index 000000000..6793ae555 --- /dev/null +++ b/examples/rl/vm_scheduling/callbacks.py @@ -0,0 +1,86 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import time +from os import makedirs +from os.path import dirname, join, realpath + +from matplotlib import pyplot as plt + +from maro.utils import Logger + +timestamp = str(time.time()) +log_dir = join(dirname(realpath(__file__)), "log", timestamp) +makedirs(log_dir, exist_ok=True) +plt_path = join(dirname(realpath(__file__)), "plots", timestamp) +makedirs(plt_path, exist_ok=True) + +logger = Logger("SIMUALTION", dump_folder=log_dir) + + +def post_collect(trackers, ep, segment): + # print the env metric from each rollout worker + for tracker in trackers: + logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + + +def post_evaluate(trackers, ep): + # print the env metric from each rollout worker + for tracker in trackers: + logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + + for i, tracker in enumerate(trackers): + core_requirement = tracker["vm_core_requirement"] + action_sequence = tracker["action_sequence"] + # plot action sequence + fig = plt.figure(figsize=(40, 32)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(action_sequence) + fig.savefig(f"{plt_path}/action_sequence_{ep}") + plt.cla() + plt.close("all") + + # plot with legal action mask + fig = plt.figure(figsize=(40, 32)) + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) + + fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") + + plt.cla() + plt.close("all") + + # plot without 
legal actin mask + fig = plt.figure(figsize=(40, 32)) + + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0]) + + fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") + + plt.cla() + plt.close("all") diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 5b44f0e65..946cc1030 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -1,9 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import numpy as np import torch from torch.optim import Adam, SGD, lr_scheduler +from maro.rl.exploration import MultiLinearExplorationScheduler from maro.simulator import Env @@ -17,10 +19,10 @@ num_pms = Env(**env_conf).business_engine._pm_amount pm_window_size = 1 -state_dim = 2 * num_pms * pm_window_size + 4 +num_features = 2 * num_pms * pm_window_size + 4 -pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], -# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], +pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"] +# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"] reward_shaping_conf = { @@ -43,9 +45,11 @@ test_seed = 1024 +algorithm = "ac" # "dqn" or "ac" + ######################################### A2C settings ######################################## actor_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "hidden_dims": [64, 32, 32], "activation": torch.nn.LeakyReLU, @@ -55,7 +59,7 @@ } critic_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "output_dim": 1, "hidden_dims": [256, 128, 64], "activation": torch.nn.LeakyReLU, @@ -78,7 +82,7 @@ ######################################### DQN settings ######################################## q_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "hidden_dims": [64, 128, 256], "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "activation": torch.nn.LeakyReLU, @@ -92,21 +96,30 @@ q_net_optim_conf = (SGD, {"lr": 0.0005}) q_net_lr_scheduler_conf = (lr_scheduler.CosineAnnealingWarmRestarts, {"T_0": 500, "T_mult": 2}) + +def masked_eps_greedy(states, actions, num_actions, *, epsilon): + masks = states[:, num_features:] + return np.array([ + action if np.random.random() > epsilon else np.random.choice(np.where(mask == 1)[0]) + for action, mask in zip(actions, masks) + ]) + dqn_conf = { "reward_discount": 0.9, "update_target_every": 5, - "train_epochs": 100, + "num_epochs": 100, "soft_update_coeff": 0.1, "double": False, + "exploration_strategy": (masked_eps_greedy, {"epsilon": 0.4}), + "exploration_scheduling_options": [( + "epsilon", MultiLinearExplorationScheduler, { + "splits": [(100, 0.32)], + "initial_value": 0.4, + "last_ep": 400, + "final_value": 0.0, + } + )], "replay_memory_capacity": 10000, "rollout_batch_size": 2560, "train_batch_size": 256, } - - -exploration_conf = { - "last_ep": 400, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(100, 0.32)] -} diff --git a/examples/rl/vm_scheduling/env_sampler.py 
b/examples/rl/vm_scheduling/env_sampler.py index c2282984c..129b24837 100644 --- a/examples/rl/vm_scheduling/env_sampler.py +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -6,7 +6,6 @@ import numpy as np -from maro.rl.exploration import MultiLinearExplorationScheduler from maro.rl.learning import AbsEnvSampler from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction @@ -14,43 +13,34 @@ vm_path = dirname(realpath(__file__)) sys.path.insert(0, vm_path) from config import ( - env_conf, exploration_conf, pm_attributes, pm_window_size, reward_shaping_conf, seed, test_env_conf, + algorithm, env_conf, pm_attributes, pm_window_size, reward_shaping_conf, num_features, seed, test_env_conf, test_reward_shaping_conf, test_seed ) from policies import policy_func_dict -def post_step(env: Env, tracker: dict, transition): - tracker["env_metric"] = env.metrics +def post_step(env, tracker: dict, state, action, env_actions, reward, tick): + tracker["env_metric"] = {key: metric for key, metric in env.metrics.items() if key != "total_latency"} + tracker["env_metric"]["latency_due_to_agent"] = env.metrics["total_latency"].due_to_agent + tracker["env_metric"]["latency_due_to_resource"] = env.metrics["total_latency"].due_to_resource if "vm_cpu_cores_requirement" not in tracker: - tracker["vm_core_requirement"] = [] + tracker["vm_cpu_cores_requirement"] = [] if "action_sequence" not in tracker: tracker["action_sequence"] = [] - tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) - tracker["action_sequence"].append(transition.action["AGENT"]) + tracker["vm_cpu_cores_requirement"].append([action, state[num_features:]]) + tracker["action_sequence"].append(action) class VMEnvSampler(AbsEnvSampler): - def __init__( - self, - get_env, - get_policy_func_dict, - exploration_scheduler_option, - agent2policy, - get_test_env=None, - post_step=None - ): - super().__init__( - get_env, get_policy_func_dict, exploration_scheduler_option, agent2policy, - get_test_env=get_test_env, post_step=post_step - ) + def __init__(self, get_env, get_policy_func_dict, agent2policy, get_test_env=None, post_step=None): + super().__init__(get_env, get_policy_func_dict, agent2policy, get_test_env=get_test_env, post_step=post_step) self._learn_env.set_seed(seed) self._test_env.set_seed(test_seed) # adjust the ratio of the success allocation and the total income when computing the reward - self.num_pms = self.env.business_engine._pm_amount # the number of pms - self._durations = self.env.business_engine._max_tick + self.num_pms = self._learn_env.business_engine._pm_amount # the number of pms + self._durations = self._learn_env.business_engine._max_tick self._pm_state_history = np.zeros((pm_window_size - 1, self.num_pms, 2)) self._legal_pm_mask = None @@ -65,7 +55,7 @@ def get_state(self, tick=None): legal_pm_mask[self.num_pms] = 1 remain_cpu_dict = dict() for pm in self.event.valid_pms: - # if two pm has same remaining cpu, only choose the one which has smaller id + # If two pms have the same remaining cpu, choose the one with the smaller id if pm_state[-1, pm, 0] not in remain_cpu_dict: remain_cpu_dict[pm_state[-1, pm, 0]] = 1 legal_pm_mask[pm] = 1 @@ -73,11 +63,11 @@ def get_state(self, tick=None): legal_pm_mask[pm] = 0 self._legal_pm_mask = legal_pm_mask - return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), legal_pm_mask))} + return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), 
legal_pm_mask)).astype(np.float32)} - def to_env_action(self, action_info): + def get_env_actions(self, action_info): action_info = action_info["AGENT"] - model_action = action_info[0] if isinstance(action_info, tuple) else action_info + model_action = action_info["action"] if isinstance(action_info, dict) else action_info if model_action == self.num_pms: return PostponeAction(vm_id=self.event.vm_id, postpone_step=1) else: @@ -121,25 +111,24 @@ def _get_pm_state(self): # get the sequence pms' information self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + return self._pm_state_history[-pm_window_size:, :, :] # (win_size, num_pms, 2) def _get_vm_state(self): - vm_info = np.array([ + return np.array([ self.event.vm_cpu_cores_requirement / self._max_cpu_capacity, self.event.vm_memory_requirement / self._max_memory_capacity, (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE self.env.business_engine._get_unit_price( self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) - ], dtype=np.float32) - return vm_info + ]) def get_env_sampler(): return VMEnvSampler( get_env=lambda: Env(**env_conf), get_policy_func_dict=policy_func_dict, - exploration_scheduler_option={"dqn": {"epsilon": (MultiLinearExplorationScheduler, exploration_conf)}}, - agent2policy={"AGENT": "dqn"}, - get_test_env=lambda: Env(**test_env_conf) + agent2policy={"AGENT": algorithm}, + get_test_env=lambda: Env(**test_env_conf), + post_step=post_step ) diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py index 25e109a65..92b79b72a 100644 --- a/examples/rl/vm_scheduling/policies.py +++ b/examples/rl/vm_scheduling/policies.py @@ -2,7 +2,6 @@ import sys from os.path import dirname, realpath -import numpy as np import torch from maro.rl.modeling import DiscreteACNet, DiscreteQNet, FullyConnected @@ -11,8 +10,8 @@ vm_path = dirname(realpath(__file__)) sys.path.insert(0, vm_path) from config import ( - ac_conf, actor_net_conf, actor_optim_conf, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, - q_net_optim_conf, state_dim + ac_conf, actor_net_conf, actor_optim_conf, algorithm, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, + num_features, num_pms, q_net_optim_conf ) @@ -28,16 +27,15 @@ def __init__(self): @property def input_dim(self): - return state_dim + return num_features + num_pms + 1 @property def num_actions(self): return q_net_conf["output_dim"] - def forward(self, states): - inputs = states[:, :state_dim] - masks = states[:, state_dim:] - q_for_all_actions = self.fc(inputs) + def forward(self, states): + masks = states[:, num_features:] + q_for_all_actions = self.fc(states[:, :num_features]) return q_for_all_actions + (masks - 1) * 1e8 def step(self, loss): @@ -57,14 +55,6 @@ def apply_gradients(self, grad): self.optim.step() -def masked_eps_greedy(action, num_actions, state, *, epsilon): - mask = [st["mask"] for st in state] - return np.array([ - act if np.random.random() > epsilon else np.random.choice(np.where(mk == 1)[0]) - for act, mk in zip(action, mask) - ]) - - class MyACNet(DiscreteACNet): def __init__(self): super().__init__() @@ -75,10 +65,12 @@ def __init__(self): @property def input_dim(self): - return state_dim + return num_features + num_pms + 1 def forward(self, states, actor: bool = True, critic: bool = True): - return (self.actor(states) if actor else None), 
(self.critic(states) if critic else None) + features = states[:, :num_features].to() + masks = states[:, num_features:] + return (self.actor(features) * masks if actor else None), (self.critic(features) if critic else None) def step(self, loss): self.actor_optim.zero_grad() @@ -100,8 +92,7 @@ def apply_gradients(self, grad): self.actor_optim.step() self.critic_optim.step() - -policy_func_dict = { - "dqn": lambda name: DQN(name, MyQNet(), **dqn_conf), - "ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf) -} +if algorithm == "dqn": + policy_func_dict = {"dqn": lambda name: DQN(name, MyQNet(), **dqn_conf)} +else: + policy_func_dict = {"ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf)} diff --git a/maro/rl/policy/ac.py b/maro/rl/policy/ac.py index 62d041959..d328b318e 100644 --- a/maro/rl/policy/ac.py +++ b/maro/rl/policy/ac.py @@ -103,7 +103,7 @@ def __init__( critic_loss_cls="mse", min_logp: float = None, critic_loss_coeff: float = 1.0, - entropy_coeff: float = None, + entropy_coeff: float = .0, clip_ratio: float = None, lam: float = 0.9, max_trajectory_len: int = 10000, @@ -203,7 +203,7 @@ def get_batch_loss(self, batch: dict, explicit_grad: bool = False): # critic_loss critic_loss = self.critic_loss_func(state_values, returns) # entropy - entropy = -Categorical(action_probs).entropy().mean() if self.entropy_coeff is not None else 0 + entropy = -Categorical(action_probs).entropy().mean() if self.entropy_coeff else 0 # total loss loss = actor_loss + self.critic_loss_coeff * critic_loss + self.entropy_coeff * entropy @@ -211,7 +211,7 @@ def get_batch_loss(self, batch: dict, explicit_grad: bool = False): loss_info = { "actor_loss": actor_loss.detach().cpu().numpy(), "critic_loss": critic_loss.detach().cpu().numpy(), - "entropy": entropy.detach().cpu().numpy(), + "entropy": entropy.detach().cpu().numpy() if self.entropy_coeff else .0, "loss": loss.detach().cpu().numpy() if explicit_grad else loss } if explicit_grad: From 0880528e5c2d1673a4769f36a6bc349aeb67c549 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Sun, 12 Sep 2021 09:08:12 +0000 Subject: [PATCH 20/29] removed unwanted files from cim dir --- examples/rl/cim/ac.py | 106 ------------------------------------------ 1 file changed, 106 deletions(-) delete mode 100644 examples/rl/cim/ac.py diff --git a/examples/rl/cim/ac.py b/examples/rl/cim/ac.py deleted file mode 100644 index 71ccf9b01..000000000 --- a/examples/rl/cim/ac.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -import numpy as np -import torch - -from maro.rl.experience import ExperienceStore, UniformSampler -from maro.rl.model import DiscreteACNet, FullyConnectedBlock, OptimOption -from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig - -cim_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, cim_path) -from env_wrapper import STATE_DIM, env_config - -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "hidden_dims": [256, 128, 64], - "output_dim": env_config["wrapper"]["num_actions"], - "activation": "tanh", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "hidden_dims": [256, 128, 64], - "output_dim": 1, - "activation": "leaky_relu", - "softmax": False, - "batch_norm": True, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.001} - }, - "critic": { - "optim_cls": "rmsprop", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": .0, - "critic_loss_cls": "smooth_l1", - "train_epochs": 10, - "critic_loss_coeff": 0.1, - "entropy_coeff": 0.01, - # "clip_ratio": 0.8 # for PPO - }, - "experience_store": { - "rollout": {"capacity": 1000, "overwrite_type": "rolling"}, - "update": {"capacity": 100000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } -} - - -def get_ac_policy(mode="update"): - assert mode in {"inference", "update", "inference-update"} - class MyACNET(DiscreteACNet): - def forward(self, states, actor: bool = True, critic: bool = True): - states = torch.from_numpy(np.asarray(states)) - if len(states.shape) == 1: - states = states.unsqueeze(dim=0) - - states = states.to(self.device) - return ( - self.component["actor"](states) if actor else None, - self.component["critic"](states) if critic else None - ) - - ac_net = MyACNET( - component={ - "actor": FullyConnectedBlock(**config["model"]["network"]["actor"]), - "critic": FullyConnectedBlock(**config["model"]["network"]["critic"]) - }, - optim_option={ - "actor": OptimOption(**config["model"]["optimization"]["actor"]), - "critic": OptimOption(**config["model"]["optimization"]["critic"]) - } if mode != "inference" else None - ) - - if mode == "update": - exp_store = ExperienceStore(**config["experience_store"]["update"]) - experience_sampler_kwargs = config["sampler"]["update"] - else: - exp_store = ExperienceStore(**config["experience_store"]["rollout" if mode == "inference" else "update"]) - experience_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] - - return ActorCritic( - ac_net, ActorCriticConfig(**config["algorithm"]), exp_store, - experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=experience_sampler_kwargs - ) From 639c33fcf27e363611ae56c507839faaf20489f1 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Sun, 12 Sep 2021 09:16:41 +0000 Subject: [PATCH 21/29] reverted to simple policy manager as default --- examples/rl/workflows/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/rl/workflows/config.yml b/examples/rl/workflows/config.yml index d97b89e65..6d5f14ac2 100644 --- a/examples/rl/workflows/config.yml +++ b/examples/rl/workflows/config.yml @@ -26,7 +26,7 @@ async: policy_manager: type: simple # simple, distributed simple: - parallel: true + parallel: false distributed: group: learn num_hosts: 2 From 
c4e70ec6a74862033da8868af97f5daba02add81 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 09:24:35 +0000 Subject: [PATCH 22/29] added part of vm scheduling RL code --- examples/rl/vm_scheduling/__init__.py | 11 -- examples/rl/vm_scheduling/ac.py | 103 ----------- examples/rl/vm_scheduling/agent_wrapper.py | 35 ++++ examples/rl/vm_scheduling/env_wrapper.py | 191 --------------------- examples/rl/vm_scheduling/policy_index.py | 19 -- 5 files changed, 35 insertions(+), 324 deletions(-) delete mode 100644 examples/rl/vm_scheduling/__init__.py delete mode 100644 examples/rl/vm_scheduling/ac.py create mode 100644 examples/rl/vm_scheduling/agent_wrapper.py delete mode 100644 examples/rl/vm_scheduling/env_wrapper.py delete mode 100644 examples/rl/vm_scheduling/policy_index.py diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py deleted file mode 100644 index 44af25424..000000000 --- a/examples/rl/vm_scheduling/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from .callbacks import post_collect, post_evaluate -from .env_wrapper import get_env_sampler, get_test_env_wrapper -from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup - -__all__ = [ - "agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "get_test_env_wrapper", - "rl_policy_func_index", "update_trigger", "warmup" -] \ No newline at end of file diff --git a/examples/rl/vm_scheduling/ac.py b/examples/rl/vm_scheduling/ac.py deleted file mode 100644 index 34eacf69d..000000000 --- a/examples/rl/vm_scheduling/ac.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - -import numpy as np -import torch - -from maro.rl.experience import ReplayMemory, UniformSampler -from maro.rl.model import DiscreteACNet, FullyConnected, OptimOption -from maro.rl.policy.algorithms import ActorCritic, ActorCriticConfig - -vm_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM - -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "replay_memory": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } -} - - -def get_ac_policy(mode="update"): - class MyACNet(DiscreteACNet): - def forward(self, states, actor: bool = True, critic: bool = True): - if isinstance(states, dict): - states = [states] - inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - masks = torch.from_numpy(np.asarray([st["mask"] for st in 
states])).to(self.device) - if len(inputs.shape) == 1: - inputs = inputs.unsqueeze(dim=0) - return ( - self.component["actor"](inputs) * masks if actor else None, - self.component["critic"](inputs) if critic else None - ) - - ac_net = MyACNet( - component={ - "actor": FullyConnected(**config["model"]["network"]["actor"]), - "critic": FullyConnected(**config["model"]["network"]["critic"]) - }, - optim_option={ - "actor": OptimOption(**config["model"]["optimization"]["actor"]), - "critic": OptimOption(**config["model"]["optimization"]["critic"]) - } if mode != "inference" else None - ) - if mode == "update": - exp_store = ReplayMemory(**config["replay_memory"]["update"]) - exp_sampler_kwargs = config["sampler"]["update"] - else: - exp_store = ReplayMemory(**config["replay_memory"]["rollout" if mode == "inference" else "update"]) - exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] - - return ActorCritic( - ac_net, ActorCriticConfig(**config["algorithm"]), exp_store, - experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=exp_sampler_kwargs - ) diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py new file mode 100644 index 000000000..27e23788a --- /dev/null +++ b/examples/rl/vm_scheduling/agent_wrapper.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import sys + +from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler +from maro.rl.learning import AgentWrapper + +cim_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, cim_path) +from env_wrapper import AGENT_IDS, env_config +from policy_index import policy_func_index + + +exploration_config = { + "last_ep": 10, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(5, 0.32)] +} + +def get_agent_wrapper(): + epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) + epsilon_greedy.register_schedule( + scheduler_cls=MultiPhaseLinearExplorationScheduler, + param_name="epsilon", + **exploration_config + ) + return AgentWrapper( + {name: func(learning=False) for name, func in policy_func_index.items()}, + {name: name for name in AGENT_IDS}, + exploration_dict={f"EpsilonGreedy": epsilon_greedy}, + agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} + ) diff --git a/examples/rl/vm_scheduling/env_wrapper.py b/examples/rl/vm_scheduling/env_wrapper.py deleted file mode 100644 index 0e2d714e7..000000000 --- a/examples/rl/vm_scheduling/env_wrapper.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import numpy as np - -from maro.rl.learning import AbsEnvWrapper, Transition -from maro.simulator import Env -from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction - -def post_step(env: Env, tracker: dict, transition: Transition): - tracker["env_metric"] = env.metrics - if "vm_cpu_cores_requirement" not in tracker: - tracker["vm_core_requirement"] = [] - if "action_sequence" not in tracker: - tracker["action_sequence"] = [] - - tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) - tracker["action_sequence"].append(transition.action["AGENT"]) - - -class VMEnvWrapper(AbsEnvWrapper): - def __init__( - self, - env: Env, - pm_attributes: list, - vm_attributes: list, - alpha: float, - beta: float, - pm_window_size: int = 1, - gamma: float = 0.0, - reward_eval_delay: int = 0 - ): - super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) - self._pm_attributes = pm_attributes - self._vm_attributes = vm_attributes - self._st = 0 - self._pm_window_size = pm_window_size - # adjust the ratio of the success allocation and the total income when computing the reward - self._alpha = alpha - self._beta = beta - self._gamma = gamma # reward discount - self._num_pms = self.env._business_engine._pm_amount # the number of pms - self._durations = self.env._business_engine._max_tick - self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) - self._legal_pm_mask = None - self._state_dim = 2 * self._num_pms * pm_window_size + 4 - - @property - def state_dim(self): - return self._state_dim - - @property - def num_pms(self): - return self._num_pms - - def get_state(self, tick=None): - pm_state, vm_state = self._get_pm_state(), self._get_vm_state() - # get the legal number of PM. 
- legal_pm_mask = np.zeros(self._num_pms + 1) - if len(self._event.valid_pms) <= 0: - # no pm available - legal_pm_mask[self._num_pms] = 1 - else: - legal_pm_mask[self._num_pms] = 1 - remain_cpu_dict = dict() - for pm in self._event.valid_pms: - # if two pm has same remaining cpu, only choose the one which has smaller id - if pm_state[-1, pm, 0] not in remain_cpu_dict: - remain_cpu_dict[pm_state[-1, pm, 0]] = 1 - legal_pm_mask[pm] = 1 - else: - legal_pm_mask[pm] = 0 - - self._legal_pm_mask = legal_pm_mask - return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} - - def to_env_action(self, action_info): - action_info = action_info["AGENT"] - model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._num_pms: - return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) - else: - return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) - - def get_reward(self, actions, tick=None): - if isinstance(actions, PostponeAction): # postponement - if np.sum(self._legal_pm_mask) != 1: - reward = -0.1 * self._alpha + 0.0 * self._beta - else: - reward = 0.0 * self._alpha + 0.0 * self._beta - elif self._event: - vm_unit_price = self.env._business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement - ) - reward = ( - 1.0 * self._alpha + self._beta * vm_unit_price * - min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) - ) - else: - reward = .0 - return {"AGENT": np.float32(reward)} - - def _get_pm_state(self): - total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] - total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) - - # normalize the attributes of pms' cpu and memory - self._max_cpu_capacity = np.max(total_pm_info[:, 0]) - self._max_memory_capacity = np.max(total_pm_info[:, 1]) - total_pm_info[:, 2] /= self._max_cpu_capacity - total_pm_info[:, 3] /= self._max_memory_capacity - - # get the remaining cpu and memory of the pms - remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) - remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) - - # get the pms' information - total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) - - # get the sequence pms' information - self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) - - def _get_vm_state(self): - vm_info = np.array([ - self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, - self._event.vm_memory_requirement / self._max_memory_capacity, - (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE - self.env._business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement - ) - ], dtype=np.float32) - return vm_info - - -env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k", - "start_tick": 0, - "durations": 300, # 8638 - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 666 -} - - -test_env_config 
= { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k.oversubscription", - "start_tick": 0, - "durations": 300, - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 1024 -} - - -def get_env_sampler(replay_agent_ids=None): - env = Env(**env_config["basic"]) - env.set_seed(env_config["seed"]) - return VMEnvWrapper(env, **env_config["wrapper"]) - - -def get_test_env_wrapper(): - test_env = Env(**test_env_config["basic"]) - test_env.set_seed(test_env_config["seed"]) - return VMEnvWrapper(test_env, **test_env_config["wrapper"]) - - -tmp_env_wrapper = get_env_sampler() -STATE_DIM = tmp_env_wrapper.state_dim -NUM_PMS = tmp_env_wrapper.num_pms -del tmp_env_wrapper diff --git a/examples/rl/vm_scheduling/policy_index.py b/examples/rl/vm_scheduling/policy_index.py deleted file mode 100644 index d9f3dad26..000000000 --- a/examples/rl/vm_scheduling/policy_index.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - - -cim_path = os.path.dirname(os.path.realpath(__file__)) -if cim_path not in sys.path: - sys.path.insert(0, cim_path) -from ac import get_ac_policy -from dqn import get_dqn_policy - -update_trigger = {"POLICY": 128} -warmup = {"POLICY": 1} - -# use agent IDs as policy names since each agent uses a separate policy -rl_policy_func_index = {"POLICY": get_ac_policy} -agent2policy = {"AGENT": "POLICY"} From 975119ec240aba1c6a34dac75e5cb8e2f5a57fcc Mon Sep 17 00:00:00 2001 From: yaqiu Date: Fri, 9 Jul 2021 11:11:48 +0000 Subject: [PATCH 23/29] refined vm env_wrapper code style --- examples/rl/vm_scheduling/agent_wrapper.py | 35 -------- examples/rl/vm_scheduling/config.py | 100 +++++++++++++++++++++ examples/rl/vm_scheduling/policies.py | 0 3 files changed, 100 insertions(+), 35 deletions(-) delete mode 100644 examples/rl/vm_scheduling/agent_wrapper.py create mode 100644 examples/rl/vm_scheduling/config.py create mode 100644 examples/rl/vm_scheduling/policies.py diff --git a/examples/rl/vm_scheduling/agent_wrapper.py b/examples/rl/vm_scheduling/agent_wrapper.py deleted file mode 100644 index 27e23788a..000000000 --- a/examples/rl/vm_scheduling/agent_wrapper.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import sys - -from maro.rl.exploration import EpsilonGreedyExploration, MultiPhaseLinearExplorationScheduler -from maro.rl.learning import AgentWrapper - -cim_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, cim_path) -from env_wrapper import AGENT_IDS, env_config -from policy_index import policy_func_index - - -exploration_config = { - "last_ep": 10, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(5, 0.32)] -} - -def get_agent_wrapper(): - epsilon_greedy = EpsilonGreedyExploration(num_actions=env_config["wrapper"]["num_actions"]) - epsilon_greedy.register_schedule( - scheduler_cls=MultiPhaseLinearExplorationScheduler, - param_name="epsilon", - **exploration_config - ) - return AgentWrapper( - {name: func(learning=False) for name, func in policy_func_index.items()}, - {name: name for name in AGENT_IDS}, - exploration_dict={f"EpsilonGreedy": epsilon_greedy}, - agent2exploration={name: "EpsilonGreedy" for name in AGENT_IDS} - ) diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py new file mode 100644 index 000000000..629ed65dd --- /dev/null +++ b/examples/rl/vm_scheduling/config.py @@ -0,0 +1,100 @@ + +config = { + "model": { + "network": { + "actor": { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": "leaky_relu", + "softmax": True, + "batch_norm": False, + "head": True + }, + "critic": { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True + } + }, + "optimization": { + "actor": { + "optim_cls": "adam", + "optim_params": {"lr": 0.0001} + }, + "critic": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.001} + } + } + }, + "algorithm": { + "reward_discount": 0.9, + "train_epochs": 100, + "critic_loss_cls": "mse", + "critic_loss_coeff": 0.1 + }, + "experience_store": { + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 128, "replace": True} + } +} + + +config = { + "model": { + "network": { + "actor": { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": "leaky_relu", + "softmax": True, + "batch_norm": False, + "head": True + }, + "critic": { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True + } + }, + "optimization": { + "actor": { + "optim_cls": "adam", + "optim_params": {"lr": 0.0001} + }, + "critic": { + "optim_cls": "sgd", + "optim_params": {"lr": 0.001} + } + } + }, + "algorithm": { + "reward_discount": 0.9, + "train_epochs": 100, + "critic_loss_cls": "mse", + "critic_loss_coeff": 0.1 + }, + "experience_store": { + "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, + "update": {"capacity": 50000, "overwrite_type": "rolling"} + }, + "sampler": { + "rollout": {"batch_size": -1, "replace": False}, + "update": {"batch_size": 128, "replace": True} + } +} + diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py new file mode 100644 index 000000000..e69de29bb From 66c32e9566a5c3333880e991416b113f3b1c1c93 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Mon, 19 Jul 
2021 08:29:11 +0000 Subject: [PATCH 24/29] vm scheduling RL code finished --- examples/rl/vm_scheduling/callbacks.py | 87 ----------------- examples/rl/vm_scheduling/dqn.py | 124 ------------------------- 2 files changed, 211 deletions(-) delete mode 100644 examples/rl/vm_scheduling/callbacks.py delete mode 100644 examples/rl/vm_scheduling/dqn.py diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py deleted file mode 100644 index f82291374..000000000 --- a/examples/rl/vm_scheduling/callbacks.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import time -from os import makedirs -from os.path import dirname, join, realpath - -import matplotlib.pyplot as plt - -from maro.utils import Logger - -timestamp = str(time.time()) - -log_dir = join(dirname(realpath(__file__)), "log", timestamp) -makedirs(log_dir, exist_ok=True) - -plt_path = join(dirname(realpath(__file__)), "plots", timestamp) -makedirs(plt_path, exist_ok=True) - - -simulation_logger = Logger("SIMUALTION", dump_folder=log_dir) - -def post_collect(trackers, ep, segment): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") - -def post_evaluate(trackers, ep): - # print the env metric from each rollout worker - for tracker in trackers: - simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") - - # print the average env metric - if len(trackers) > 1: - metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) - avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") - - for i, tracker in enumerate(trackers): - core_requirement = tracker["vm_core_requirement"] - action_sequence = tracker["action_sequence"] - # plot action sequence - fig = plt.figure(figsize=(40, 32)) - ax = fig.add_subplot(1, 1, 1) - ax.plot(action_sequence) - fig.savefig(f"{plt_path}/action_sequence_{ep}") - plt.cla() - plt.close("all") - - # plot with legal action mask - fig = plt.figure(figsize=(40, 32)) - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) - - fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") - - plt.cla() - plt.close("all") - - # plot without legal actin mask - fig = plt.figure(figsize=(40, 32)) - - for idx, key in enumerate(core_requirement.keys()): - ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) - for i in range(len(core_requirement[key])): - if i == 0: - ax.plot(core_requirement[key][i][0], label=str(key)) - ax.legend() - else: - ax.plot(core_requirement[key][i][0]) - - fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") - - plt.cla() - plt.close("all") diff --git 
a/examples/rl/vm_scheduling/dqn.py b/examples/rl/vm_scheduling/dqn.py deleted file mode 100644 index 4f7499779..000000000 --- a/examples/rl/vm_scheduling/dqn.py +++ /dev/null @@ -1,124 +0,0 @@ - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import sys - -import numpy as np -import torch - -from maro.rl.experience import ReplayMemory, UniformSampler -from maro.rl.exploration import DiscreteSpaceExploration, MultiLinearExplorationScheduler -from maro.rl.model import DiscreteQNet, FullyConnected, OptimOption -from maro.rl.policy.algorithms import DQN, DQNConfig - -vm_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, vm_path) -from env_wrapper import NUM_PMS, STATE_DIM - -config = { - "model": { - "network": { - "input_dim": STATE_DIM, - "hidden_dims": [64, 128, 256], - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "skip_connection": False, - "head": True, - "dropout_p": 0.0 - }, - "optimization": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.0005}, - "scheduler_cls": "cosine_annealing_warm_restarts", - "scheduler_params": {"T_0": 500, "T_mult": 2} - } - }, - "algorithm": { - "reward_discount": 0.9, - "update_target_every": 5, - "train_epochs": 100, - "soft_update_coeff": 0.1, - "double": False - }, - "replay_memory": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 256, "replace": True} - }, - "exploration": { - "last_ep": 400, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(100, 0.32)] - } -} - - -class MyQNet(DiscreteQNet): - def __init__(self, component, optim_option, device: str = None): - super().__init__(component, optim_option=optim_option, device=device) - for mdl in self.modules(): - if isinstance(mdl, torch.nn.Linear): - torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) - - def forward(self, states): - if isinstance(states, dict): - states = [states] - inputs = torch.from_numpy(np.asarray([st["model"] for st in states])).to(self.device) - masks = torch.from_numpy(np.asarray([st["mask"] for st in states])).to(self.device) - if len(inputs.shape) == 1: - inputs = inputs.unsqueeze(dim=0) - q_for_all_actions = self.component(inputs) - return q_for_all_actions + (masks - 1) * 1e8 - - -class MaskedEpsilonGreedy(DiscreteSpaceExploration): - def __init__(self, epsilon: float = .0): - super().__init__() - self.epsilon = epsilon - - def __call__(self, action, state): - if isinstance(state, dict): - state = [state] - mask = [st["mask"] for st in state] - return np.array([ - act if np.random.random() > self.epsilon else np.random.choice(np.where(mk == 1)[0]) - for act, mk in zip(action, mask) - ]) - - -def get_dqn_policy(mode="update"): - assert mode in {"inference", "update", "inference-update"} - q_net = MyQNet( - FullyConnected(**config["model"]["network"]), - optim_option=OptimOption(**config["model"]["optimization"]) if mode != "inference" else None - ) - - if mode == "update": - exp_store = ReplayMemory(**config["replay_memory"]["update"]) - exploration = None - exp_sampler_kwargs = config["sampler"]["update"] - else: - exp_store = ReplayMemory(**config["replay_memory"]["rollout"]) - exploration = MaskedEpsilonGreedy() - exploration.register_schedule( - 
scheduler_cls=MultiLinearExplorationScheduler, - param_name="epsilon", - **config["exploration"] - ) - exp_store = ReplayMemory(**config["replay_memory"]["rollout" if mode == "inference" else "update"]) - exp_sampler_kwargs = config["sampler"]["rollout" if mode == "inference" else "update"] - - return DQN( - q_net, DQNConfig(**config["algorithm"]), exp_store, - experience_sampler_cls=UniformSampler, - experience_sampler_kwargs=exp_sampler_kwargs, - exploration=exploration - ) From f14ec813de2bceaeb1d5503984232a0b3400bee8 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Tue, 7 Sep 2021 07:22:18 +0000 Subject: [PATCH 25/29] added config.py for vm scheduing --- examples/rl/vm_scheduling/config.py | 119 ++++++++------ examples/rl/vm_scheduling/env_sampler.py | 192 +++++++++++++++++++++++ 2 files changed, 264 insertions(+), 47 deletions(-) create mode 100644 examples/rl/vm_scheduling/env_sampler.py diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 629ed65dd..26b594ed9 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -1,51 +1,76 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "experience_store": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } +import torch +from torch.optim import Adam, SGD + + +env_conf = { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k", + "start_tick": 0, + "durations": 300, # 8638 + "snapshot_resolution": 1 +} + +pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], +vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + +shaping_conf = { + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9, + "seed": 666 +} + + +eval_env_conf = { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k.oversubscription", + "start_tick": 0, + "durations": 300, + "snapshot_resolution": 1 +} + +eval_shaping_conf = { + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9, + "seed": 1024 +} + +actor_net_conf = { + "input_dim": STATE_DIM, + "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "hidden_dims": [64, 32, 32], + "activation": torch.nn.LeakyReLU, + "softmax": True, + "batch_norm": False, + "head": True +} + +critic_net_conf = { + "input_dim": STATE_DIM, + "output_dim": 1, + "hidden_dims": [256, 128, 64], + "activation": "leaky_relu", + "softmax": False, + "batch_norm": False, + "head": True +} + +actor_optim_conf = (Adam, {"lr": 
0.0001}) +critic_optim_conf = (SGD, {"lr": 0.001}) + +ac_conf = { + "reward_discount": 0.9, + "grad_iters": 100, + "critic_loss_cls": torch.nn.MSELoss, + "critic_loss_coeff": 0.1, + "max_trajectory_len": 10000, + "get_loss_on_rollout": False } diff --git a/examples/rl/vm_scheduling/env_sampler.py b/examples/rl/vm_scheduling/env_sampler.py new file mode 100644 index 000000000..929b4a264 --- /dev/null +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -0,0 +1,192 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import numpy as np + +from maro.rl.learning import AbsEnvSampler +from maro.simulator import Env +from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction + + +def post_step(env: Env, tracker: dict, transition): + tracker["env_metric"] = env.metrics + if "vm_cpu_cores_requirement" not in tracker: + tracker["vm_core_requirement"] = [] + if "action_sequence" not in tracker: + tracker["action_sequence"] = [] + + tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) + tracker["action_sequence"].append(transition.action["AGENT"]) + + +class VMEnvSampler(AbsEnvSampler): + def __init__( + self, + env: Env, + pm_attributes: list, + vm_attributes: list, + alpha: float, + beta: float, + pm_window_size: int = 1, + gamma: float = 0.0, + reward_eval_delay: int = 0 + ): + super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) + self._pm_attributes = pm_attributes + self._vm_attributes = vm_attributes + self._st = 0 + self._pm_window_size = pm_window_size + # adjust the ratio of the success allocation and the total income when computing the reward + self._alpha = alpha + self._beta = beta + self._gamma = gamma # reward discount + self._num_pms = self.env.business_engine._pm_amount # the number of pms + self._durations = self.env.business_engine._max_tick + self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) + self._legal_pm_mask = None + self._state_dim = 2 * self._num_pms * pm_window_size + 4 + + @property + def state_dim(self): + return self._state_dim + + @property + def num_pms(self): + return self._num_pms + + def get_state(self, tick=None): + pm_state, vm_state = self._get_pm_state(), self._get_vm_state() + # get the legal number of PM. 
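To make the masking logic that follows easier to parse: the action space has num_pms + 1 entries, where index num_pms is the postpone action and is always legal, and among the valid PMs only the lowest-id PM at each distinct remaining-CPU level is kept legal, which prunes symmetric duplicates from the action space. A tiny standalone illustration with made-up numbers (illustrative only, not part of the scenario code):

import numpy as np

num_pms = 4
valid_pms = [0, 1, 2, 3]
remaining_cpu = [0.5, 0.5, 0.25, 0.75]   # hypothetical pm_state[-1, :, 0] values

mask = np.zeros(num_pms + 1)
mask[num_pms] = 1                         # postponement is always allowed
seen = set()
for pm in valid_pms:
    if remaining_cpu[pm] not in seen:     # keep only the lowest-id PM per CPU level
        seen.add(remaining_cpu[pm])
        mask[pm] = 1

print(mask)  # [1. 0. 1. 1. 1.] -> PM 1 is masked out as a duplicate of PM 0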
+ legal_pm_mask = np.zeros(self._num_pms + 1) + if len(self._event.valid_pms) <= 0: + # no pm available + legal_pm_mask[self._num_pms] = 1 + else: + legal_pm_mask[self._num_pms] = 1 + remain_cpu_dict = dict() + for pm in self._event.valid_pms: + # if two pm has same remaining cpu, only choose the one which has smaller id + if pm_state[-1, pm, 0] not in remain_cpu_dict: + remain_cpu_dict[pm_state[-1, pm, 0]] = 1 + legal_pm_mask[pm] = 1 + else: + legal_pm_mask[pm] = 0 + + self._legal_pm_mask = legal_pm_mask + return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} + + def to_env_action(self, action_info): + action_info = action_info["AGENT"] + model_action = action_info[0] if isinstance(action_info, tuple) else action_info + if model_action == self._num_pms: + return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + else: + return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) + + def get_reward(self, actions, tick=None): + if isinstance(actions, PostponeAction): # postponement + if np.sum(self._legal_pm_mask) != 1: + reward = -0.1 * self._alpha + 0.0 * self._beta + else: + reward = 0.0 * self._alpha + 0.0 * self._beta + elif self._event: + vm_unit_price = self.env.business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) + reward = ( + 1.0 * self._alpha + self._beta * vm_unit_price * + min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) + ) + else: + reward = .0 + return {"AGENT": np.float32(reward)} + + def _get_pm_state(self): + total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] + total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) + + # normalize the attributes of pms' cpu and memory + self._max_cpu_capacity = np.max(total_pm_info[:, 0]) + self._max_memory_capacity = np.max(total_pm_info[:, 1]) + total_pm_info[:, 2] /= self._max_cpu_capacity + total_pm_info[:, 3] /= self._max_memory_capacity + + # get the remaining cpu and memory of the pms + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) + remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) + + # get the pms' information + total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) + + # get the sequence pms' information + self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) + return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + + def _get_vm_state(self): + vm_info = np.array([ + self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, + self._event.vm_memory_requirement / self._max_memory_capacity, + (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE + self.env.business_engine._get_unit_price( + self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + ) + ], dtype=np.float32) + return vm_info + + +env_config = { + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k", + "start_tick": 0, + "durations": 300, # 8638 + "snapshot_resolution": 1 + }, + "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9 + }, + "seed": 666 +} + + +eval_env_config = 
{ + "basic": { + "scenario": "vm_scheduling", + "topology": "azure.2019.10k.oversubscription", + "start_tick": 0, + "durations": 300, + "snapshot_resolution": 1 + }, + "wrapper": { + "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], + "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + "alpha": 0.0, + "beta": 1.0, + "pm_window_size": 1, + "gamma": 0.9 + }, + "seed": 1024 +} + + +def get_env_wrapper(replay_agent_ids=None): + env = Env(**env_config["basic"]) + env.set_seed(env_config["seed"]) + return VMEnvWrapper(env, **env_config["wrapper"]) + + +def get_eval_env_wrapper(): + eval_env = Env(**eval_env_config["basic"]) + eval_env.set_seed(eval_env_config["seed"]) + return VMEnvWrapper(eval_env, **eval_env_config["wrapper"]) + + +tmp_env_wrapper = get_env_wrapper() +STATE_DIM = tmp_env_wrapper.state_dim +NUM_PMS = tmp_env_wrapper.num_pms +del tmp_env_wrapper \ No newline at end of file From c105be0a68db7b0cd5a116472ac49dbb413a99f6 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Wed, 22 Sep 2021 15:40:51 +0000 Subject: [PATCH 26/29] resolved rebase conflicts --- examples/rl/vm_scheduling/config.py | 121 ++++++++-------- examples/rl/vm_scheduling/env_sampler.py | 167 ++++++++--------------- examples/rl/vm_scheduling/policies.py | 107 +++++++++++++++ examples/rl/workflows/config.yml | 2 +- 4 files changed, 222 insertions(+), 175 deletions(-) diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 26b594ed9..5b44f0e65 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -2,7 +2,9 @@ # Licensed under the MIT license. import torch -from torch.optim import Adam, SGD +from torch.optim import Adam, SGD, lr_scheduler + +from maro.simulator import Env env_conf = { @@ -13,37 +15,38 @@ "snapshot_resolution": 1 } +num_pms = Env(**env_conf).business_engine._pm_amount +pm_window_size = 1 +state_dim = 2 * num_pms * pm_window_size + 4 + pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], -vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - -shaping_conf = { +# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], + + +reward_shaping_conf = { "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9, - "seed": 666 + "beta": 1.0 } +seed = 666 - -eval_env_conf = { +test_env_conf = { "scenario": "vm_scheduling", "topology": "azure.2019.10k.oversubscription", "start_tick": 0, "durations": 300, "snapshot_resolution": 1 } - -eval_shaping_conf = { +test_reward_shaping_conf = { "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9, - "seed": 1024 + "beta": 1.0 } +test_seed = 1024 + +######################################### A2C settings ######################################## actor_net_conf = { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 + "input_dim": state_dim, + "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "hidden_dims": [64, 32, 32], "activation": torch.nn.LeakyReLU, "softmax": True, @@ -52,10 +55,10 @@ } critic_net_conf = { - "input_dim": STATE_DIM, + "input_dim": state_dim, "output_dim": 1, "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", + "activation": torch.nn.LeakyReLU, "softmax": False, "batch_norm": False, "head": True @@ 
-73,53 +76,37 @@ "get_loss_on_rollout": False } +######################################### DQN settings ######################################## +q_net_conf = { + "input_dim": state_dim, + "hidden_dims": [64, 128, 256], + "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 + "activation": torch.nn.LeakyReLU, + "softmax": False, + "batch_norm": False, + "skip_connection": False, + "head": True, + "dropout_p": 0.0 +} -config = { - "model": { - "network": { - "actor": { - "input_dim": STATE_DIM, - "output_dim": NUM_PMS + 1, # action could be any PM or postponement, hence the plus 1 - "hidden_dims": [64, 32, 32], - "activation": "leaky_relu", - "softmax": True, - "batch_norm": False, - "head": True - }, - "critic": { - "input_dim": STATE_DIM, - "output_dim": 1, - "hidden_dims": [256, 128, 64], - "activation": "leaky_relu", - "softmax": False, - "batch_norm": False, - "head": True - } - }, - "optimization": { - "actor": { - "optim_cls": "adam", - "optim_params": {"lr": 0.0001} - }, - "critic": { - "optim_cls": "sgd", - "optim_params": {"lr": 0.001} - } - } - }, - "algorithm": { - "reward_discount": 0.9, - "train_epochs": 100, - "critic_loss_cls": "mse", - "critic_loss_coeff": 0.1 - }, - "experience_store": { - "rollout": {"capacity": 10000, "overwrite_type": "rolling"}, - "update": {"capacity": 50000, "overwrite_type": "rolling"} - }, - "sampler": { - "rollout": {"batch_size": -1, "replace": False}, - "update": {"batch_size": 128, "replace": True} - } +q_net_optim_conf = (SGD, {"lr": 0.0005}) +q_net_lr_scheduler_conf = (lr_scheduler.CosineAnnealingWarmRestarts, {"T_0": 500, "T_mult": 2}) + +dqn_conf = { + "reward_discount": 0.9, + "update_target_every": 5, + "train_epochs": 100, + "soft_update_coeff": 0.1, + "double": False, + "replay_memory_capacity": 10000, + "rollout_batch_size": 2560, + "train_batch_size": 256, } + +exploration_conf = { + "last_ep": 400, + "initial_value": 0.4, + "final_value": 0.0, + "splits": [(100, 0.32)] +} diff --git a/examples/rl/vm_scheduling/env_sampler.py b/examples/rl/vm_scheduling/env_sampler.py index 929b4a264..c2282984c 100644 --- a/examples/rl/vm_scheduling/env_sampler.py +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -1,12 +1,24 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
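The exploration_conf added to config.py above describes a piecewise-linear epsilon schedule for DQN: epsilon starts at 0.4, is annealed to 0.32 by episode 100, and then to 0.0 by episode 400. It is consumed by MARO's MultiLinearExplorationScheduler in the env sampler below; the interpolation itself amounts to the following self-contained sketch (a plain restatement of the schedule, not the library class, and it assumes the splits are (episode, epsilon) breakpoints):

def piecewise_linear_epsilon(ep, last_ep=400, initial_value=0.4, final_value=0.0, splits=((100, 0.32),)):
    # breakpoints: (0, initial) -> each split -> (last_ep, final), linearly interpolated in between
    points = [(0, initial_value), *sorted(splits), (last_ep, final_value)]
    for (e0, v0), (e1, v1) in zip(points, points[1:]):
        if e0 <= ep <= e1:
            return v0 + (v1 - v0) * (ep - e0) / (e1 - e0)
    return final_value  # episodes beyond last_ep keep the final value

assert abs(piecewise_linear_epsilon(0) - 0.4) < 1e-9
assert abs(piecewise_linear_epsilon(100) - 0.32) < 1e-9
assert abs(piecewise_linear_epsilon(400) - 0.0) < 1e-9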
+import sys +from os.path import dirname, realpath + import numpy as np +from maro.rl.exploration import MultiLinearExplorationScheduler from maro.rl.learning import AbsEnvSampler from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction +vm_path = dirname(realpath(__file__)) +sys.path.insert(0, vm_path) +from config import ( + env_conf, exploration_conf, pm_attributes, pm_window_size, reward_shaping_conf, seed, test_env_conf, + test_reward_shaping_conf, test_seed +) +from policies import policy_func_dict + def post_step(env: Env, tracker: dict, transition): tracker["env_metric"] = env.metrics @@ -22,49 +34,37 @@ def post_step(env: Env, tracker: dict, transition): class VMEnvSampler(AbsEnvSampler): def __init__( self, - env: Env, - pm_attributes: list, - vm_attributes: list, - alpha: float, - beta: float, - pm_window_size: int = 1, - gamma: float = 0.0, - reward_eval_delay: int = 0 + get_env, + get_policy_func_dict, + exploration_scheduler_option, + agent2policy, + get_test_env=None, + post_step=None ): - super().__init__(env, reward_eval_delay=reward_eval_delay, replay_agent_ids=["AGENT"], post_step=post_step) - self._pm_attributes = pm_attributes - self._vm_attributes = vm_attributes - self._st = 0 - self._pm_window_size = pm_window_size + super().__init__( + get_env, get_policy_func_dict, exploration_scheduler_option, agent2policy, + get_test_env=get_test_env, post_step=post_step + ) + self._learn_env.set_seed(seed) + self._test_env.set_seed(test_seed) + # adjust the ratio of the success allocation and the total income when computing the reward - self._alpha = alpha - self._beta = beta - self._gamma = gamma # reward discount - self._num_pms = self.env.business_engine._pm_amount # the number of pms + self.num_pms = self.env.business_engine._pm_amount # the number of pms self._durations = self.env.business_engine._max_tick - self._pm_state_history = np.zeros((pm_window_size - 1, self._num_pms, 2)) + self._pm_state_history = np.zeros((pm_window_size - 1, self.num_pms, 2)) self._legal_pm_mask = None - self._state_dim = 2 * self._num_pms * pm_window_size + 4 - - @property - def state_dim(self): - return self._state_dim - - @property - def num_pms(self): - return self._num_pms def get_state(self, tick=None): pm_state, vm_state = self._get_pm_state(), self._get_vm_state() # get the legal number of PM. 
- legal_pm_mask = np.zeros(self._num_pms + 1) - if len(self._event.valid_pms) <= 0: + legal_pm_mask = np.zeros(self.num_pms + 1) + if len(self.event.valid_pms) <= 0: # no pm available - legal_pm_mask[self._num_pms] = 1 + legal_pm_mask[self.num_pms] = 1 else: - legal_pm_mask[self._num_pms] = 1 + legal_pm_mask[self.num_pms] = 1 remain_cpu_dict = dict() - for pm in self._event.valid_pms: + for pm in self.event.valid_pms: # if two pm has same remaining cpu, only choose the one which has smaller id if pm_state[-1, pm, 0] not in remain_cpu_dict: remain_cpu_dict[pm_state[-1, pm, 0]] = 1 @@ -73,37 +73,38 @@ def get_state(self, tick=None): legal_pm_mask[pm] = 0 self._legal_pm_mask = legal_pm_mask - return {"AGENT": {"model": np.concatenate((pm_state.flatten(), vm_state.flatten())), "mask": legal_pm_mask}} + return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), legal_pm_mask))} def to_env_action(self, action_info): action_info = action_info["AGENT"] model_action = action_info[0] if isinstance(action_info, tuple) else action_info - if model_action == self._num_pms: - return PostponeAction(vm_id=self._event.vm_id, postpone_step=1) + if model_action == self.num_pms: + return PostponeAction(vm_id=self.event.vm_id, postpone_step=1) else: - return AllocateAction(vm_id=self._event.vm_id, pm_id=model_action) + return AllocateAction(vm_id=self.event.vm_id, pm_id=model_action) - def get_reward(self, actions, tick=None): + def get_reward(self, actions, tick): + conf = reward_shaping_conf if self.env == self._learn_env else test_reward_shaping_conf if isinstance(actions, PostponeAction): # postponement if np.sum(self._legal_pm_mask) != 1: - reward = -0.1 * self._alpha + 0.0 * self._beta + reward = -0.1 * conf["alpha"] + 0.0 * conf["beta"] else: - reward = 0.0 * self._alpha + 0.0 * self._beta - elif self._event: + reward = 0.0 * conf["alpha"] + 0.0 * conf["beta"] + elif self.event: vm_unit_price = self.env.business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) reward = ( - 1.0 * self._alpha + self._beta * vm_unit_price * - min(self._durations - self._event.frame_index, self._event.remaining_buffer_time) + 1.0 * conf["alpha"] + conf["beta"] * vm_unit_price * + min(self._durations - self.event.frame_index, self.event.remaining_buffer_time) ) else: reward = .0 return {"AGENT": np.float32(reward)} def _get_pm_state(self): - total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::self._pm_attributes] - total_pm_info = total_pm_info.reshape(self._num_pms, len(self._pm_attributes)) + total_pm_info = self.env.snapshot_list["pms"][self.env.frame_index::pm_attributes] + total_pm_info = total_pm_info.reshape(self.num_pms, len(pm_attributes)) # normalize the attributes of pms' cpu and memory self._max_cpu_capacity = np.max(total_pm_info[:, 0]) @@ -112,81 +113,33 @@ def _get_pm_state(self): total_pm_info[:, 3] /= self._max_memory_capacity # get the remaining cpu and memory of the pms - remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self._num_pms, 1) - remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self._num_pms, 1) + remain_cpu = (1 - total_pm_info[:, 2]).reshape(1, self.num_pms, 1) + remain_memory = (1 - total_pm_info[:, 3]).reshape(1, self.num_pms, 1) # get the pms' information total_pm_info = np.concatenate((remain_cpu, remain_memory), axis=2) # (1, num_pms, 2) # get the sequence pms' information self._pm_state_history = 
np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[-self._pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + return self._pm_state_history[pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) def _get_vm_state(self): vm_info = np.array([ - self._event.vm_cpu_cores_requirement / self._max_cpu_capacity, - self._event.vm_memory_requirement / self._max_memory_capacity, + self.event.vm_cpu_cores_requirement / self._max_cpu_capacity, + self.event.vm_memory_requirement / self._max_memory_capacity, (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE self.env.business_engine._get_unit_price( - self._event.vm_cpu_cores_requirement, self._event.vm_memory_requirement + self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) ], dtype=np.float32) return vm_info -env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k", - "start_tick": 0, - "durations": 300, # 8638 - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 666 -} - - -eval_env_config = { - "basic": { - "scenario": "vm_scheduling", - "topology": "azure.2019.10k.oversubscription", - "start_tick": 0, - "durations": 300, - "snapshot_resolution": 1 - }, - "wrapper": { - "pm_attributes": ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], - "vm_attributes": ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], - "alpha": 0.0, - "beta": 1.0, - "pm_window_size": 1, - "gamma": 0.9 - }, - "seed": 1024 -} - - -def get_env_wrapper(replay_agent_ids=None): - env = Env(**env_config["basic"]) - env.set_seed(env_config["seed"]) - return VMEnvWrapper(env, **env_config["wrapper"]) - - -def get_eval_env_wrapper(): - eval_env = Env(**eval_env_config["basic"]) - eval_env.set_seed(eval_env_config["seed"]) - return VMEnvWrapper(eval_env, **eval_env_config["wrapper"]) - - -tmp_env_wrapper = get_env_wrapper() -STATE_DIM = tmp_env_wrapper.state_dim -NUM_PMS = tmp_env_wrapper.num_pms -del tmp_env_wrapper \ No newline at end of file +def get_env_sampler(): + return VMEnvSampler( + get_env=lambda: Env(**env_conf), + get_policy_func_dict=policy_func_dict, + exploration_scheduler_option={"dqn": {"epsilon": (MultiLinearExplorationScheduler, exploration_conf)}}, + agent2policy={"AGENT": "dqn"}, + get_test_env=lambda: Env(**test_env_conf) + ) diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py index e69de29bb..25e109a65 100644 --- a/examples/rl/vm_scheduling/policies.py +++ b/examples/rl/vm_scheduling/policies.py @@ -0,0 +1,107 @@ + +import sys +from os.path import dirname, realpath + +import numpy as np +import torch + +from maro.rl.modeling import DiscreteACNet, DiscreteQNet, FullyConnected +from maro.rl.policy import DQN, ActorCritic + +vm_path = dirname(realpath(__file__)) +sys.path.insert(0, vm_path) +from config import ( + ac_conf, actor_net_conf, actor_optim_conf, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, + q_net_optim_conf, state_dim +) + + +class MyQNet(DiscreteQNet): + def __init__(self): + super().__init__() + for mdl in self.modules(): + if isinstance(mdl, torch.nn.Linear): + 
torch.nn.init.xavier_uniform_(mdl.weight, gain=torch.nn.init.calculate_gain('leaky_relu')) + + self.fc = FullyConnected(**q_net_conf) + self.optim = q_net_optim_conf[0](self.fc.parameters(), **q_net_optim_conf[1]) + + @property + def input_dim(self): + return state_dim + + @property + def num_actions(self): + return q_net_conf["output_dim"] + + def forward(self, states): + inputs = states[:, :state_dim] + masks = states[:, state_dim:] + q_for_all_actions = self.fc(inputs) + return q_for_all_actions + (masks - 1) * 1e8 + + def step(self, loss): + self.optim.zero_grad() + loss.backward() + self.optim.step() + + def get_gradients(self, loss): + self.optim.zero_grad() + loss.backward() + return {name: param.grad for name, param in self.named_parameters()} + + def apply_gradients(self, grad): + for name, param in self.named_parameters(): + param.grad = grad[name] + + self.optim.step() + + +def masked_eps_greedy(action, num_actions, state, *, epsilon): + mask = [st["mask"] for st in state] + return np.array([ + act if np.random.random() > epsilon else np.random.choice(np.where(mk == 1)[0]) + for act, mk in zip(action, mask) + ]) + + +class MyACNet(DiscreteACNet): + def __init__(self): + super().__init__() + self.actor = FullyConnected(**actor_net_conf) + self.critic = FullyConnected(**critic_net_conf) + self.actor_optim = actor_optim_conf[0](self.actor.parameters(), **actor_optim_conf[1]) + self.critic_optim = critic_optim_conf[0](self.critic.parameters(), **critic_optim_conf[1]) + + @property + def input_dim(self): + return state_dim + + def forward(self, states, actor: bool = True, critic: bool = True): + return (self.actor(states) if actor else None), (self.critic(states) if critic else None) + + def step(self, loss): + self.actor_optim.zero_grad() + self.critic_optim.zero_grad() + loss.backward() + self.actor_optim.step() + self.critic_optim.step() + + def get_gradients(self, loss): + self.actor_optim.zero_grad() + self.critic_optim.zero_grad() + loss.backward() + return {name: param.grad for name, param in self.named_parameters()} + + def apply_gradients(self, grad): + for name, param in self.named_parameters(): + param.grad = grad[name] + + self.actor_optim.step() + self.critic_optim.step() + + +policy_func_dict = { + "dqn": lambda name: DQN(name, MyQNet(), **dqn_conf), + "ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf) +} diff --git a/examples/rl/workflows/config.yml b/examples/rl/workflows/config.yml index 972e2b687..cf831283e 100644 --- a/examples/rl/workflows/config.yml +++ b/examples/rl/workflows/config.yml @@ -20,7 +20,7 @@ sync: async: num_rollouts: 3 policy_manager: - type: multi-process # simple, multi-process, distributed + type: simple # simple, multi-process, distributed distributed: num_hosts: 2 data_parallel: From 2982ea4368a4ff0ee61f3983280bacb344c81e61 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Sun, 12 Sep 2021 09:05:08 +0000 Subject: [PATCH 27/29] fixed bugs in vm_scheduling --- docker_files/dev.df | 1 + examples/rl/vm_scheduling/__init__.py | 8 +++ examples/rl/vm_scheduling/callbacks.py | 86 ++++++++++++++++++++++++ examples/rl/vm_scheduling/config.py | 43 +++++++----- examples/rl/vm_scheduling/env_sampler.py | 55 ++++++--------- examples/rl/vm_scheduling/policies.py | 37 ++++------ 6 files changed, 159 insertions(+), 71 deletions(-) create mode 100644 examples/rl/vm_scheduling/__init__.py create mode 100644 examples/rl/vm_scheduling/callbacks.py diff --git a/docker_files/dev.df b/docker_files/dev.df index f2f126eb5..4b08d2671 100644 --- a/docker_files/dev.df 
+++ b/docker_files/dev.df @@ -20,6 +20,7 @@ RUN pip install --no-cache-dir pyzmq==19.0.2 RUN pip install --no-cache-dir numpy==1.19.1 RUN pip install --no-cache-dir torch==1.6.0 RUN pip install --no-cache-dir scipy +RUN pip install --no-cache-dir matplotlib RUN pip install --no-cache-dir redis COPY maro /maro/maro diff --git a/examples/rl/vm_scheduling/__init__.py b/examples/rl/vm_scheduling/__init__.py new file mode 100644 index 000000000..280488bcc --- /dev/null +++ b/examples/rl/vm_scheduling/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .callbacks import post_collect, post_evaluate +from .env_sampler import get_env_sampler +from .policies import policy_func_dict + +__all__ = ["post_collect", "post_evaluate", "get_env_sampler", "policy_func_dict"] diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py new file mode 100644 index 000000000..6793ae555 --- /dev/null +++ b/examples/rl/vm_scheduling/callbacks.py @@ -0,0 +1,86 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import time +from os import makedirs +from os.path import dirname, join, realpath + +from matplotlib import pyplot as plt + +from maro.utils import Logger + +timestamp = str(time.time()) +log_dir = join(dirname(realpath(__file__)), "log", timestamp) +makedirs(log_dir, exist_ok=True) +plt_path = join(dirname(realpath(__file__)), "plots", timestamp) +makedirs(plt_path, exist_ok=True) + +logger = Logger("SIMUALTION", dump_folder=log_dir) + + +def post_collect(trackers, ep, segment): + # print the env metric from each rollout worker + for tracker in trackers: + logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + + +def post_evaluate(trackers, ep): + # print the env metric from each rollout worker + for tracker in trackers: + logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + + # print the average env metric + if len(trackers) > 1: + metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) + avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} + logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + + for i, tracker in enumerate(trackers): + core_requirement = tracker["vm_core_requirement"] + action_sequence = tracker["action_sequence"] + # plot action sequence + fig = plt.figure(figsize=(40, 32)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(action_sequence) + fig.savefig(f"{plt_path}/action_sequence_{ep}") + plt.cla() + plt.close("all") + + # plot with legal action mask + fig = plt.figure(figsize=(40, 32)) + for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0] * core_requirement[key][i][1]) + + fig.savefig(f"{plt_path}/values_with_legal_action_{ep}") + + plt.cla() + plt.close("all") + + # plot without legal actin mask + fig = plt.figure(figsize=(40, 32)) + 
+ for idx, key in enumerate(core_requirement.keys()): + ax = fig.add_subplot(len(core_requirement.keys()), 1, idx + 1) + for i in range(len(core_requirement[key])): + if i == 0: + ax.plot(core_requirement[key][i][0], label=str(key)) + ax.legend() + else: + ax.plot(core_requirement[key][i][0]) + + fig.savefig(f"{plt_path}/values_without_legal_action_{ep}") + + plt.cla() + plt.close("all") diff --git a/examples/rl/vm_scheduling/config.py b/examples/rl/vm_scheduling/config.py index 5b44f0e65..946cc1030 100644 --- a/examples/rl/vm_scheduling/config.py +++ b/examples/rl/vm_scheduling/config.py @@ -1,9 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import numpy as np import torch from torch.optim import Adam, SGD, lr_scheduler +from maro.rl.exploration import MultiLinearExplorationScheduler from maro.simulator import Env @@ -17,10 +19,10 @@ num_pms = Env(**env_conf).business_engine._pm_amount pm_window_size = 1 -state_dim = 2 * num_pms * pm_window_size + 4 +num_features = 2 * num_pms * pm_window_size + 4 -pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"], -# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"], +pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"] +# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"] reward_shaping_conf = { @@ -43,9 +45,11 @@ test_seed = 1024 +algorithm = "ac" # "dqn" or "ac" + ######################################### A2C settings ######################################## actor_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "hidden_dims": [64, 32, 32], "activation": torch.nn.LeakyReLU, @@ -55,7 +59,7 @@ } critic_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "output_dim": 1, "hidden_dims": [256, 128, 64], "activation": torch.nn.LeakyReLU, @@ -78,7 +82,7 @@ ######################################### DQN settings ######################################## q_net_conf = { - "input_dim": state_dim, + "input_dim": num_features, "hidden_dims": [64, 128, 256], "output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1 "activation": torch.nn.LeakyReLU, @@ -92,21 +96,30 @@ q_net_optim_conf = (SGD, {"lr": 0.0005}) q_net_lr_scheduler_conf = (lr_scheduler.CosineAnnealingWarmRestarts, {"T_0": 500, "T_mult": 2}) + +def masked_eps_greedy(states, actions, num_actions, *, epsilon): + masks = states[:, num_features:] + return np.array([ + action if np.random.random() > epsilon else np.random.choice(np.where(mask == 1)[0]) + for action, mask in zip(actions, masks) + ]) + dqn_conf = { "reward_discount": 0.9, "update_target_every": 5, - "train_epochs": 100, + "num_epochs": 100, "soft_update_coeff": 0.1, "double": False, + "exploration_strategy": (masked_eps_greedy, {"epsilon": 0.4}), + "exploration_scheduling_options": [( + "epsilon", MultiLinearExplorationScheduler, { + "splits": [(100, 0.32)], + "initial_value": 0.4, + "last_ep": 400, + "final_value": 0.0, + } + )], "replay_memory_capacity": 10000, "rollout_batch_size": 2560, "train_batch_size": 256, } - - -exploration_conf = { - "last_ep": 400, - "initial_value": 0.4, - "final_value": 0.0, - "splits": [(100, 0.32)] -} diff --git a/examples/rl/vm_scheduling/env_sampler.py b/examples/rl/vm_scheduling/env_sampler.py index c2282984c..129b24837 
100644 --- a/examples/rl/vm_scheduling/env_sampler.py +++ b/examples/rl/vm_scheduling/env_sampler.py @@ -6,7 +6,6 @@ import numpy as np -from maro.rl.exploration import MultiLinearExplorationScheduler from maro.rl.learning import AbsEnvSampler from maro.simulator import Env from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction @@ -14,43 +13,34 @@ vm_path = dirname(realpath(__file__)) sys.path.insert(0, vm_path) from config import ( - env_conf, exploration_conf, pm_attributes, pm_window_size, reward_shaping_conf, seed, test_env_conf, + algorithm, env_conf, pm_attributes, pm_window_size, reward_shaping_conf, num_features, seed, test_env_conf, test_reward_shaping_conf, test_seed ) from policies import policy_func_dict -def post_step(env: Env, tracker: dict, transition): - tracker["env_metric"] = env.metrics +def post_step(env, tracker: dict, state, action, env_actions, reward, tick): + tracker["env_metric"] = {key: metric for key, metric in env.metrics.items() if key != "total_latency"} + tracker["env_metric"]["latency_due_to_agent"] = env.metrics["total_latency"].due_to_agent + tracker["env_metric"]["latency_due_to_resource"] = env.metrics["total_latency"].due_to_resource if "vm_cpu_cores_requirement" not in tracker: - tracker["vm_core_requirement"] = [] + tracker["vm_cpu_cores_requirement"] = [] if "action_sequence" not in tracker: tracker["action_sequence"] = [] - tracker["vm_core_requirement"].append([transition.action["AGENT"], transition.state["AGENT"]["mask"]]) - tracker["action_sequence"].append(transition.action["AGENT"]) + tracker["vm_cpu_cores_requirement"].append([action, state[num_features:]]) + tracker["action_sequence"].append(action) class VMEnvSampler(AbsEnvSampler): - def __init__( - self, - get_env, - get_policy_func_dict, - exploration_scheduler_option, - agent2policy, - get_test_env=None, - post_step=None - ): - super().__init__( - get_env, get_policy_func_dict, exploration_scheduler_option, agent2policy, - get_test_env=get_test_env, post_step=post_step - ) + def __init__(self, get_env, get_policy_func_dict, agent2policy, get_test_env=None, post_step=None): + super().__init__(get_env, get_policy_func_dict, agent2policy, get_test_env=get_test_env, post_step=post_step) self._learn_env.set_seed(seed) self._test_env.set_seed(test_seed) # adjust the ratio of the success allocation and the total income when computing the reward - self.num_pms = self.env.business_engine._pm_amount # the number of pms - self._durations = self.env.business_engine._max_tick + self.num_pms = self._learn_env.business_engine._pm_amount # the number of pms + self._durations = self._learn_env.business_engine._max_tick self._pm_state_history = np.zeros((pm_window_size - 1, self.num_pms, 2)) self._legal_pm_mask = None @@ -65,7 +55,7 @@ def get_state(self, tick=None): legal_pm_mask[self.num_pms] = 1 remain_cpu_dict = dict() for pm in self.event.valid_pms: - # if two pm has same remaining cpu, only choose the one which has smaller id + # If two pms have the same remaining cpu, choose the one with the smaller id if pm_state[-1, pm, 0] not in remain_cpu_dict: remain_cpu_dict[pm_state[-1, pm, 0]] = 1 legal_pm_mask[pm] = 1 @@ -73,11 +63,11 @@ def get_state(self, tick=None): legal_pm_mask[pm] = 0 self._legal_pm_mask = legal_pm_mask - return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), legal_pm_mask))} + return {"AGENT": np.concatenate((pm_state.flatten(), vm_state.flatten(), legal_pm_mask)).astype(np.float32)} - def to_env_action(self, action_info): + def 
get_env_actions(self, action_info): action_info = action_info["AGENT"] - model_action = action_info[0] if isinstance(action_info, tuple) else action_info + model_action = action_info["action"] if isinstance(action_info, dict) else action_info if model_action == self.num_pms: return PostponeAction(vm_id=self.event.vm_id, postpone_step=1) else: @@ -121,25 +111,24 @@ def _get_pm_state(self): # get the sequence pms' information self._pm_state_history = np.concatenate((self._pm_state_history, total_pm_info), axis=0) - return self._pm_state_history[pm_window_size:, :, :].astype(np.float32) # (win_size, num_pms, 2) + return self._pm_state_history[-pm_window_size:, :, :] # (win_size, num_pms, 2) def _get_vm_state(self): - vm_info = np.array([ + return np.array([ self.event.vm_cpu_cores_requirement / self._max_cpu_capacity, self.event.vm_memory_requirement / self._max_memory_capacity, (self._durations - self.env.tick) * 1.0 / 200, # TODO: CHANGE 200 TO SOMETHING CONFIGURABLE self.env.business_engine._get_unit_price( self.event.vm_cpu_cores_requirement, self.event.vm_memory_requirement ) - ], dtype=np.float32) - return vm_info + ]) def get_env_sampler(): return VMEnvSampler( get_env=lambda: Env(**env_conf), get_policy_func_dict=policy_func_dict, - exploration_scheduler_option={"dqn": {"epsilon": (MultiLinearExplorationScheduler, exploration_conf)}}, - agent2policy={"AGENT": "dqn"}, - get_test_env=lambda: Env(**test_env_conf) + agent2policy={"AGENT": algorithm}, + get_test_env=lambda: Env(**test_env_conf), + post_step=post_step ) diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py index 25e109a65..92b79b72a 100644 --- a/examples/rl/vm_scheduling/policies.py +++ b/examples/rl/vm_scheduling/policies.py @@ -2,7 +2,6 @@ import sys from os.path import dirname, realpath -import numpy as np import torch from maro.rl.modeling import DiscreteACNet, DiscreteQNet, FullyConnected @@ -11,8 +10,8 @@ vm_path = dirname(realpath(__file__)) sys.path.insert(0, vm_path) from config import ( - ac_conf, actor_net_conf, actor_optim_conf, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, - q_net_optim_conf, state_dim + ac_conf, actor_net_conf, actor_optim_conf, algorithm, critic_net_conf, critic_optim_conf, dqn_conf, q_net_conf, + num_features, num_pms, q_net_optim_conf ) @@ -28,16 +27,15 @@ def __init__(self): @property def input_dim(self): - return state_dim + return num_features + num_pms + 1 @property def num_actions(self): return q_net_conf["output_dim"] - def forward(self, states): - inputs = states[:, :state_dim] - masks = states[:, state_dim:] - q_for_all_actions = self.fc(inputs) + def forward(self, states): + masks = states[:, num_features:] + q_for_all_actions = self.fc(states[:, :num_features]) return q_for_all_actions + (masks - 1) * 1e8 def step(self, loss): @@ -57,14 +55,6 @@ def apply_gradients(self, grad): self.optim.step() -def masked_eps_greedy(action, num_actions, state, *, epsilon): - mask = [st["mask"] for st in state] - return np.array([ - act if np.random.random() > epsilon else np.random.choice(np.where(mk == 1)[0]) - for act, mk in zip(action, mask) - ]) - - class MyACNet(DiscreteACNet): def __init__(self): super().__init__() @@ -75,10 +65,12 @@ def __init__(self): @property def input_dim(self): - return state_dim + return num_features + num_pms + 1 def forward(self, states, actor: bool = True, critic: bool = True): - return (self.actor(states) if actor else None), (self.critic(states) if critic else None) + features = states[:, :num_features].to() + 
masks = states[:, num_features:] + return (self.actor(features) * masks if actor else None), (self.critic(features) if critic else None) def step(self, loss): self.actor_optim.zero_grad() @@ -100,8 +92,7 @@ def apply_gradients(self, grad): self.actor_optim.step() self.critic_optim.step() - -policy_func_dict = { - "dqn": lambda name: DQN(name, MyQNet(), **dqn_conf), - "ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf) -} +if algorithm == "dqn": + policy_func_dict = {"dqn": lambda name: DQN(name, MyQNet(), **dqn_conf)} +else: + policy_func_dict = {"ac": lambda name: ActorCritic(name, MyACNet(), **ac_conf)} From db17d70bf6cc7228766e30cd901c7ec91dce8f8d Mon Sep 17 00:00:00 2001 From: yaqiu Date: Wed, 22 Sep 2021 15:47:12 +0000 Subject: [PATCH 28/29] added get_state and set_state to vm_scheduling policy models --- examples/rl/vm_scheduling/callbacks.py | 12 ++++-------- examples/rl/vm_scheduling/policies.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/examples/rl/vm_scheduling/callbacks.py b/examples/rl/vm_scheduling/callbacks.py index 6793ae555..e62184845 100644 --- a/examples/rl/vm_scheduling/callbacks.py +++ b/examples/rl/vm_scheduling/callbacks.py @@ -7,39 +7,35 @@ from matplotlib import pyplot as plt -from maro.utils import Logger - timestamp = str(time.time()) log_dir = join(dirname(realpath(__file__)), "log", timestamp) makedirs(log_dir, exist_ok=True) plt_path = join(dirname(realpath(__file__)), "plots", timestamp) makedirs(plt_path, exist_ok=True) -logger = Logger("SIMUALTION", dump_folder=log_dir) - def post_collect(trackers, ep, segment): # print the env metric from each rollout worker for tracker in trackers: - logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") + print(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}") # print the average env metric if len(trackers) > 1: metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") + print(f"average env metric (episode {ep}, segment {segment}): {avg_metric}") def post_evaluate(trackers, ep): # print the env metric from each rollout worker for tracker in trackers: - logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") + print(f"env summary (evaluation episode {ep}): {tracker['env_metric']}") # print the average env metric if len(trackers) > 1: metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers) avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys} - logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}") + print(f"average env metric (evaluation episode {ep}): {avg_metric}") for i, tracker in enumerate(trackers): core_requirement = tracker["vm_core_requirement"] diff --git a/examples/rl/vm_scheduling/policies.py b/examples/rl/vm_scheduling/policies.py index 92b79b72a..38383dbdc 100644 --- a/examples/rl/vm_scheduling/policies.py +++ b/examples/rl/vm_scheduling/policies.py @@ -53,6 +53,13 @@ def apply_gradients(self, grad): param.grad = grad[name] self.optim.step() + + def get_state(self): + return {"network": self.state_dict(), "optim": self.optim.state_dict()} + + def set_state(self, state): + self.load_state_dict(state["network"]) + self.optim.load_state_dict(state["optim"]) class MyACNet(DiscreteACNet): @@ 
-92,6 +99,19 @@ def apply_gradients(self, grad): self.actor_optim.step() self.critic_optim.step() + def get_state(self): + return { + "network": self.state_dict(), + "actor_optim": self.actor_optim.state_dict(), + "critic_optim": self.critic_optim.state_dict() + } + + def set_state(self, state): + self.load_state_dict(state["network"]) + self.actor_optim.load_state_dict(state["actor_optim"]) + self.critic_optim.load_state_dict(state["critic_optim"]) + + if algorithm == "dqn": policy_func_dict = {"dqn": lambda name: DQN(name, MyQNet(), **dqn_conf)} else: From 694a6d897f026187a4506ee67db3d7a2c651d734 Mon Sep 17 00:00:00 2001 From: yaqiu Date: Thu, 23 Sep 2021 07:22:44 +0000 Subject: [PATCH 29/29] updated README for vm_scheduling with RL --- examples/rl/vm_scheduling/README.md | 30 +++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/rl/vm_scheduling/README.md b/examples/rl/vm_scheduling/README.md index a428a5c33..5381c244a 100644 --- a/examples/rl/vm_scheduling/README.md +++ b/examples/rl/vm_scheduling/README.md @@ -1,10 +1,24 @@ # Virtual Machine Scheduling -Virtual Machine (VM) scheduling is a scenario where reinforcement learning (RL) can help the virtual machine allocator allocate compute resources intelligently. In this folder you can find: -* ``env_wrapper.py``, which contains a function to generate an environment wrapper to interact -with our "agent" (see below); -* ``agent_wrapper.py``, which contains a function to generate an agent wrapper to interact -with the environment wrapper; -* ``policy_index``, which maps policy names to functions that create them; the functions to create DQN and Actor-Critic policies are defined in ``dqn.py`` and ``ac.py``, respectively. - -The code for the actual learning workflows (e.g., learner, roll-out worker and trainer) can be found under ``examples/rl/workflows``. The reason for putting it in a separate folder is that these workflows apply to any scenario, so long as the necessary component generators, such as the ones listed above, are provided. See ``README`` under ``examples/rl`` for details. We recommend that you follow this example to write your own scenarios. \ No newline at end of file +A virtual machine (VM) scheduler is a cloud computing service component responsible for providing compute resources to satisfy user demands. A good resource allocation policy should aim to optimize several metrics at the same time, such as user wait time, profit, energy consumption and physical machine (PM) overload. Many commercial cloud providers use rule-based policies. Alternatively, the policy can also be optimized using reinforcement learning (RL) techniques, which involves simulating with historical data. This example demonstrates how DQN and Actor-Critic algorithms can be applied to this scenario. In this folder, you can find: + +* ``config.py``, which contains environment and policy configurations. +* ``env_sampler.py``, which defines state, action and reward shaping pertinent to the policies. +* ``policies.py``, which defines the neural network components of DQN and Actor-Critic. +* ``callbacks.py``, which contains processing logic to be executed at step and episode levels. + +The scripts to run the learning workflows can be found under ``examples/rl/workflows``. See ``README`` under ``examples/rl`` for details about the general applicability of these scripts. We recommend that you follow this example to write your own scenarios. 
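For readers skimming the diffs rather than running them: the reward shaping defined in ``env_sampler.py`` is a two-term trade-off between allocation success (weighted by ``alpha``) and VM income (weighted by ``beta``). Stripped of the event plumbing, ``VMEnvSampler.get_reward`` boils down to the sketch below (the argument names are illustrative, not part of the actual API):

def shaped_reward(
    allocated: bool,        # True if the VM was placed on a PM, False if it was postponed
    forced_postpone: bool,  # True if postponement was the only legal action (mask sum == 1)
    unit_price: float,      # income per tick for this VM
    remaining_ticks: int,   # min(durations - event.frame_index, event.remaining_buffer_time)
    alpha: float = 0.0,
    beta: float = 1.0,
) -> float:
    if not allocated:
        # a voluntary postponement is mildly penalized; a forced one is not
        return 0.0 if forced_postpone else -0.1 * alpha
    # successful allocation: fixed bonus plus the income the VM generates before it expires
    return 1.0 * alpha + beta * unit_price * remaining_ticks

With the shipped settings (``alpha = 0.0``, ``beta = 1.0``) only the income term is active, so the agent is effectively trained to maximize revenue.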
+ + +# Some Comments About the Results + +This example is meant to serve as a demonstration of using MARO's RL toolkit in a real-life scenario. In fact, we have yet to find a configuration that makes the policy learned by either DQN or Actor-Critic perform reasonably well in our experimental settings. + +For reference, the best results have been achieved by the ``Best Fit`` algorithm (see ``examples/vm_scheduling/rule_based_algorithm/best_fit.py`` for details). The over-subscription rate is 115% in the over-subscription settings. + +|Topology | PM Setting | Time Spent(s) | Total VM Requests |Successful Allocation| Energy Consumption| Total Oversubscriptions | Total Overload PMs +|:----:|-----|:--------:|:---:|:-------:|:----:|:---:|:---:| +|10k| 100 PMs, 32 Cores, 128 GB | 104.98|10,000| 10,000| 2,399,610 | 0 | 0| +|10k.oversubscription| 100 PMs, 32 Cores, 128 GB| 101.00 |10,000 |10,000| 2,386,371| 279,331 | 0| +|336k| 880 PMs, 16 Cores, 112 GB | 7,896.37 |335,985| 109,249 |26,425,878 | 0 | 0 | +|336k.oversubscription| 880 PMs, 16 Cores, 112 GB | 7,903.33| 335,985| 115,008 | 27,440,946 | 3,868,475 | 0
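The ``Best Fit`` baseline referenced above is a greedy rule: among the PMs that can host the incoming VM, pick the one that fits most tightly (fewest spare CPU cores), and postpone when no PM is valid. The sketch below restates that idea against the same simulator attributes used by the env samplers in this patch series, with the interaction loop following the standard MARO ``Env.step`` pattern; it illustrates the heuristic only, and the selection/tie-breaking criterion may differ from the shipped ``best_fit.py``.

from maro.simulator import Env
from maro.simulator.scenarios.vm_scheduling import AllocateAction, PostponeAction


def best_fit_action(env: Env, decision_event):
    # no valid PM for this VM request -> postpone by one step
    if len(decision_event.valid_pms) == 0:
        return PostponeAction(vm_id=decision_event.vm_id, postpone_step=1)
    pm_info = env.snapshot_list["pms"][
        env.frame_index::["cpu_cores_capacity", "cpu_cores_allocated"]
    ].reshape(-1, 2)
    spare_cores = pm_info[:, 0] - pm_info[:, 1]
    # tightest fit: the valid PM with the least spare CPU
    chosen = min(decision_event.valid_pms, key=lambda pm: spare_cores[pm])
    return AllocateAction(vm_id=decision_event.vm_id, pm_id=chosen)


if __name__ == "__main__":
    env = Env(scenario="vm_scheduling", topology="azure.2019.10k", start_tick=0, durations=300)
    metrics, decision_event, done = env.step(None)
    while not done:
        metrics, decision_event, done = env.step(best_fit_action(env, decision_event))
    print(metrics)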