VM scheduling with RL #375

Merged
merged 52 commits on Sep 26, 2021
Commits
fbcaa10
added part of vm scheduling RL code
Jul 9, 2021
ff7d1ac
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 9, 2021
b7530c1
refined vm env_wrapper code style
Jul 9, 2021
c240210
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 9, 2021
c9bb66f
added DQN
Jul 9, 2021
b690376
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 9, 2021
f3645bd
updated exploration for VM
Jul 11, 2021
0235f89
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 13, 2021
4e1a8b1
added get_experiences func for ac in vm scheduling
Jul 13, 2021
095854c
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 15, 2021
9155297
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 15, 2021
9553344
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 15, 2021
41bf039
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 15, 2021
c52085a
added post_step callback to env wrapper
Jul 15, 2021
f324d91
moved Aiming's tracking and plotting logic into callbacks
Jul 15, 2021
3a5e5b7
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 15, 2021
272d2cc
added eval env wrapper
Jul 16, 2021
7aa6d14
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 19, 2021
c0a0817
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 19, 2021
c51a7b6
renamed AC config variable name for VM
Jul 19, 2021
a76b0e6
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 19, 2021
74fc932
vm scheduling RL code finished
Jul 19, 2021
9e9da67
updated README
Jul 19, 2021
453ec15
fixed various bugs and hard coding for vm_scheduling
Jul 22, 2021
e815b1a
fixed merge conflicts
Jul 22, 2021
2847f6a
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Jul 22, 2021
ed4da3a
uncommented callbacks for VM scheduling
Jul 26, 2021
03b6b78
Minor revision for better code style
lihuoran Aug 5, 2021
1b4d1fc
added part of vm scheduling RL code
Jul 9, 2021
610f681
refined vm env_wrapper code style
Jul 9, 2021
6ba958f
vm scheduling RL code finished
Jul 19, 2021
5cb38f2
added config.py for vm scheduing
Sep 7, 2021
4654110
vm example refactoring
Sep 7, 2021
c51b9e6
fixed bugs in vm_scheduling
Sep 12, 2021
faf0084
fixed conflicts with remote
Sep 12, 2021
0880528
removed unwanted files from cim dir
Sep 12, 2021
41c79b4
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Sep 12, 2021
639c33f
reverted to simple policy manager as default
Sep 12, 2021
9c708ae
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Sep 12, 2021
bc348d7
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Sep 15, 2021
c4e70ec
added part of vm scheduling RL code
Jul 9, 2021
975119e
refined vm env_wrapper code style
Jul 9, 2021
66c32e9
vm scheduling RL code finished
Jul 19, 2021
f14ec81
added config.py for vm scheduing
Sep 7, 2021
c105be0
resolved rebase conflicts
Sep 22, 2021
2982ea4
fixed bugs in vm_scheduling
Sep 12, 2021
db17d70
added get_state and set_state to vm_scheduling policy models
Sep 22, 2021
aa5a357
conflict fix
Sep 22, 2021
b3f0cf3
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Sep 22, 2021
694a6d8
updated README for vm_scheduling with RL
Sep 23, 2021
4481d5f
1. fixed NAN bug in VM scheduling with AC; 2. updated README
Sep 24, 2021
104d2f3
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_vm
Sep 24, 2021
1 change: 1 addition & 0 deletions docker_files/dev.df
@@ -20,6 +20,7 @@ RUN pip install --no-cache-dir pyzmq==19.0.2
RUN pip install --no-cache-dir numpy==1.19.1
RUN pip install --no-cache-dir torch==1.6.0
RUN pip install --no-cache-dir scipy
RUN pip install --no-cache-dir matplotlib
RUN pip install --no-cache-dir redis

COPY maro /maro/maro
30 changes: 22 additions & 8 deletions examples/rl/vm_scheduling/README.md
@@ -1,10 +1,24 @@
# Virtual Machine Scheduling

Virtual Machine (VM) scheduling is a scenario where reinforcement learning (RL) can help the virtual machine allocator allocate compute resources intelligently. In this folder you can find:
* ``env_wrapper.py``, which contains a function to generate an environment wrapper to interact
with our "agent" (see below);
* ``agent_wrapper.py``, which contains a function to generate an agent wrapper to interact
with the environment wrapper;
* ``policy_index``, which maps policy names to functions that create them; the functions to create DQN and Actor-Critic policies are defined in ``dqn.py`` and ``ac.py``, respectively.

The code for the actual learning workflows (e.g., learner, roll-out worker and trainer) can be found under ``examples/rl/workflows``. The reason for putting it in a separate folder is that these workflows apply to any scenario, so long as the necessary component generators, such as the ones listed above, are provided. See ``README`` under ``examples/rl`` for details. We recommend that you follow this example to write your own scenarios.
A virtual machine (VM) scheduler is a cloud computing component responsible for allocating compute resources to satisfy user demands. A good allocation policy should optimize several metrics at once, such as user wait time, profit, energy consumption and physical machine (PM) overload. Many commercial cloud providers use rule-based policies, but the policy can also be optimized with reinforcement learning (RL) techniques, trained in simulation on historical data. This example demonstrates how DQN and Actor-Critic can be applied to this scenario. In this folder, you can find:

* ``config.py``, which contains environment and policy configurations;
* ``env_sampler.py``, which defines state, action and reward shaping in the ``VMEnvSampler`` class;
* ``policies.py``, which defines the Q-net for DQN and the network components for Actor-Critic;
* ``callbacks.py``, which contains routines to be invoked at the end of a training or evaluation episode.

The scripts to run the learning workflows can be found under ``examples/rl/workflows``. See ``README`` under ``examples/rl`` for details about the general applicability of these scripts. We recommend that you follow this example to write your own scenarios.
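
For orientation, here is a minimal sketch of the surface such a scenario module exposes to those generic workflow scripts. The names mirror this example's ``__init__.py`` and ``callbacks.py``; the bodies are placeholders rather than the actual implementation.

```python
# Sketch of the scenario-module surface consumed by the generic workflow scripts.
# Names follow this example's __init__.py; bodies are placeholders, not the real code.
from typing import Callable, Dict, List


def get_env_sampler():
    """Create the object that wraps the simulator and performs state, action and
    reward shaping (implemented by VMEnvSampler in env_sampler.py)."""
    raise NotImplementedError


# Maps each policy name to a function that creates it (defined in policies.py).
policy_func_dict: Dict[str, Callable] = {}

# Maps each agent in the scenario to the name of the policy it uses.
agent2policy: Dict[str, str] = {}


def post_collect(trackers: List[dict], ep: int, segment: int) -> None:
    """Invoked after each data-collection segment, e.g. to log environment metrics."""


def post_evaluate(trackers: List[dict], ep: int) -> None:
    """Invoked after each evaluation episode, e.g. to log metrics and plot action statistics."""
```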


# Some Comments About the Results

This example is meant to serve as a demonstration of using MARO's RL toolkit in a real-life scenario. In fact, we have yet to find a configuration that makes the policy learned by either DQN or Actor-Critic perform reasonably well in our experimental settings.

For reference, the best results have been achieved by the ``Best Fit`` algorithm (see ``examples/vm_scheduling/rule_based_algorithm/best_fit.py`` for details). The over-subscription rate is 115% in the over-subscription settings.

| Topology | PM Setting | Time Spent (s) | Total VM Requests | Successful Allocations | Energy Consumption | Total Oversubscriptions | Total Overload PMs |
|:----:|-----|:--------:|:---:|:-------:|:----:|:---:|:---:|
| 10k | 100 PMs, 32 Cores, 128 GB | 104.98 | 10,000 | 10,000 | 2,399,610 | 0 | 0 |
| 10k.oversubscription | 100 PMs, 32 Cores, 128 GB | 101.00 | 10,000 | 10,000 | 2,386,371 | 279,331 | 0 |
| 336k | 880 PMs, 16 Cores, 112 GB | 7,896.37 | 335,985 | 109,249 | 26,425,878 | 0 | 0 |
| 336k.oversubscription | 880 PMs, 16 Cores, 112 GB | 7,903.33 | 335,985 | 115,008 | 27,440,946 | 3,868,475 | 0 |
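
For context, the snippet below sketches the best-fit idea behind that baseline: among the PMs that can host the request, pick the one with the least remaining CPU capacity, and postpone when none fits. The data structures and the tie-breaking metric here are illustrative; the actual implementation lives in ``examples/vm_scheduling/rule_based_algorithm/best_fit.py``.

```python
# Illustrative sketch of the best-fit rule; not the code in best_fit.py.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class PM:
    id: int
    cpu_cores_remaining: int
    memory_remaining: int


def best_fit(pms: List[PM], cpu_req: int, mem_req: int) -> Optional[int]:
    """Return the ID of the valid PM with the fewest remaining cores, or None to postpone."""
    valid = [pm for pm in pms if pm.cpu_cores_remaining >= cpu_req and pm.memory_remaining >= mem_req]
    if not valid:
        return None  # no PM can host this request; postpone it
    return min(valid, key=lambda pm: pm.cpu_cores_remaining).id
```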
19 changes: 8 additions & 11 deletions examples/rl/vm_scheduling/__init__.py
@@ -1,11 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_wrapper import get_env_sampler, get_test_env_wrapper
from .policy_index import agent2policy, rl_policy_func_index, update_trigger, warmup

__all__ = [
"agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "get_test_env_wrapper",
"rl_policy_func_index", "update_trigger", "warmup"
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, get_env_sampler
from .policies import policy_func_dict

__all__ = ["agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "policy_func_dict"]
103 changes: 0 additions & 103 deletions examples/rl/vm_scheduling/ac.py

This file was deleted.

21 changes: 8 additions & 13 deletions examples/rl/vm_scheduling/callbacks.py
@@ -5,45 +5,40 @@
from os import makedirs
from os.path import dirname, join, realpath

import matplotlib.pyplot as plt

from maro.utils import Logger
from matplotlib import pyplot as plt

timestamp = str(time.time())

log_dir = join(dirname(realpath(__file__)), "log", timestamp)
makedirs(log_dir, exist_ok=True)

plt_path = join(dirname(realpath(__file__)), "plots", timestamp)
makedirs(plt_path, exist_ok=True)


simulation_logger = Logger("SIMUALTION", dump_folder=log_dir)

def post_collect(trackers, ep, segment):
# print the env metric from each rollout worker
for tracker in trackers:
simulation_logger.info(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}")
print(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}")

# print the average env metric
if len(trackers) > 1:
metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers)
avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys}
simulation_logger.info(f"average env metric (episode {ep}, segment {segment}): {avg_metric}")
print(f"average env metric (episode {ep}, segment {segment}): {avg_metric}")


def post_evaluate(trackers, ep):
# print the env metric from each rollout worker
for tracker in trackers:
simulation_logger.info(f"env summary (evaluation episode {ep}): {tracker['env_metric']}")
print(f"env summary (evaluation episode {ep}): {tracker['env_metric']}")

# print the average env metric
if len(trackers) > 1:
metric_keys, num_trackers = trackers[0]["env_metric"].keys(), len(trackers)
avg_metric = {key: sum(tr["env_metric"][key] for tr in trackers) / num_trackers for key in metric_keys}
simulation_logger.info(f"average env metric (evaluation episode {ep}): {avg_metric}")
print(f"average env metric (evaluation episode {ep}): {avg_metric}")

for i, tracker in enumerate(trackers):
core_requirement = tracker["vm_core_requirement"]
for tracker in trackers:
core_requirement = tracker["actions_by_core_requirement"]
action_sequence = tracker["action_sequence"]
# plot action sequence
fig = plt.figure(figsize=(40, 32))
126 changes: 126 additions & 0 deletions examples/rl/vm_scheduling/config.py
@@ -0,0 +1,126 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np
import torch
from torch.optim import Adam, SGD, lr_scheduler

from maro.rl.exploration import MultiLinearExplorationScheduler
from maro.simulator import Env


env_conf = {
"scenario": "vm_scheduling",
"topology": "azure.2019.10k",
"start_tick": 0,
"durations": 300, # 8638
"snapshot_resolution": 1
}

num_pms = Env(**env_conf).business_engine._pm_amount
pm_window_size = 1
num_features = 2 * num_pms * pm_window_size + 4

pm_attributes = ["cpu_cores_capacity", "memory_capacity", "cpu_cores_allocated", "memory_allocated"]
# vm_attributes = ["cpu_cores_requirement", "memory_requirement", "lifetime", "remain_time", "total_income"]


reward_shaping_conf = {
"alpha": 0.0,
"beta": 1.0
}
seed = 666

test_env_conf = {
"scenario": "vm_scheduling",
"topology": "azure.2019.10k.oversubscription",
"start_tick": 0,
"durations": 300,
"snapshot_resolution": 1
}
test_reward_shaping_conf = {
"alpha": 0.0,
"beta": 1.0
}

test_seed = 1024

algorithm = "ac" # "dqn" or "ac"

######################################### A2C settings ########################################
actor_net_conf = {
"input_dim": num_features,
"output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1
"hidden_dims": [64, 32, 32],
"activation": torch.nn.LeakyReLU,
"softmax": True,
"batch_norm": False,
"head": True
}

critic_net_conf = {
"input_dim": num_features,
"output_dim": 1,
"hidden_dims": [256, 128, 64],
"activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"head": True
}

actor_optim_conf = (Adam, {"lr": 0.0001})
critic_optim_conf = (SGD, {"lr": 0.001})

ac_conf = {
"reward_discount": 0.9,
"grad_iters": 100,
"critic_loss_cls": torch.nn.MSELoss,
"critic_loss_coeff": 0.1,
"min_logp": -20,
"max_trajectory_len": 10000,
"get_loss_on_rollout": False
}

######################################### DQN settings ########################################
q_net_conf = {
"input_dim": num_features,
"hidden_dims": [64, 128, 256],
"output_dim": num_pms + 1, # action could be any PM or postponement, hence the plus 1
"activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"skip_connection": False,
"head": True,
"dropout_p": 0.0
}

q_net_optim_conf = (SGD, {"lr": 0.0005})
q_net_lr_scheduler_conf = (lr_scheduler.CosineAnnealingWarmRestarts, {"T_0": 500, "T_mult": 2})


def masked_eps_greedy(states, actions, num_actions, *, epsilon):
masks = states[:, num_features:]
return np.array([
action if np.random.random() > epsilon else np.random.choice(np.where(mask == 1)[0])
for action, mask in zip(actions, masks)
])

dqn_conf = {
"reward_discount": 0.9,
"update_target_every": 5,
"num_epochs": 100,
"soft_update_coeff": 0.1,
"double": False,
"exploration_strategy": (masked_eps_greedy, {"epsilon": 0.4}),
"exploration_scheduling_options": [(
"epsilon", MultiLinearExplorationScheduler, {
"splits": [(100, 0.32)],
"initial_value": 0.4,
"last_ep": 400,
"final_value": 0.0,
}
)],
"replay_memory_capacity": 10000,
"rollout_batch_size": 2560,
"train_batch_size": 256,
}
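
As a sanity check on the masked exploration defined in ``config.py`` above, the toy snippet below exercises the same logic with made-up sizes (two features and a three-entry action mask per state); it is illustrative only.

```python
import numpy as np

num_features = 2  # toy value; in config.py this is 2 * num_pms * pm_window_size + 4


def masked_eps_greedy(states, actions, num_actions, *, epsilon):
    # Same logic as the function in config.py: with probability epsilon, resample
    # uniformly from the actions allowed by the mask appended to each state.
    masks = states[:, num_features:]
    return np.array([
        action if np.random.random() > epsilon else np.random.choice(np.where(mask == 1)[0])
        for action, mask in zip(actions, masks)
    ])


# Each row: two features followed by a 0/1 legal-action mask (last entry = postponement).
states = np.array([
    [0.5, 0.2, 1, 0, 1],  # actions 0 and 2 are legal
    [0.1, 0.9, 0, 1, 1],  # actions 1 and 2 are legal
])
greedy_actions = np.array([0, 1])

# With epsilon=1.0 every action is resampled, so the output always respects the mask.
print(masked_eps_greedy(states, greedy_actions, num_actions=3, epsilon=1.0))
```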