From d3cd39e281b972a6793c06ad2dd4e7f3fe453708 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Wed, 9 Dec 2020 01:36:38 -0800 Subject: [PATCH 01/14] Initial commit of gym wrapper --- PythonClient/airgym/__init__.py | 6 + PythonClient/airgym/envs/__init__.py | 2 + PythonClient/airgym/envs/airsim_env.py | 206 +++++++++++++++++++++++++ 3 files changed, 214 insertions(+) create mode 100644 PythonClient/airgym/__init__.py create mode 100644 PythonClient/airgym/envs/__init__.py create mode 100644 PythonClient/airgym/envs/airsim_env.py diff --git a/PythonClient/airgym/__init__.py b/PythonClient/airgym/__init__.py new file mode 100644 index 0000000000..004e2062da --- /dev/null +++ b/PythonClient/airgym/__init__.py @@ -0,0 +1,6 @@ +from gym.envs.registration import register + +register( + id='airsim-drone-v0', + entry_point='air_gym.envs:AirSimDroneEnv', +) diff --git a/PythonClient/airgym/envs/__init__.py b/PythonClient/airgym/envs/__init__.py new file mode 100644 index 0000000000..73601239a3 --- /dev/null +++ b/PythonClient/airgym/envs/__init__.py @@ -0,0 +1,2 @@ +from air_gym.envs.airsim_env import AirSimEnv +from air_gym.envs.airsim_env import AirSimDroneEnv diff --git a/PythonClient/airgym/envs/airsim_env.py b/PythonClient/airgym/envs/airsim_env.py new file mode 100644 index 0000000000..178ae70ab8 --- /dev/null +++ b/PythonClient/airgym/envs/airsim_env.py @@ -0,0 +1,206 @@ +import numpy as np +import airsim + +import gym +from gym import error, spaces, utils +from gym.utils import seeding +from gym.spaces import Tuple, Box, Discrete, MultiDiscrete + +from collections import OrderedDict + + +class AirSimEnv(gym.Env): + metadata = {"render.modes": ["rgb_array"]} + + def __init__(self, image_shape): + self.observation_space = spaces.Box(0, 255, shape=image_shape, dtype=np.uint8) + self._seed() + + self.viewer = None + self.steps = 0 + self.no_episode = 0 + self.reward_sum = 0 + + def __del__(self): + raise NotImplementedError() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _compute_reward(self): + raise NotImplementedError() + + def step(self, action): + raise NotImplementedError() + + def _get_obs(self): + raise NotImplementedError() + + def close(self): + raise NotImplementedError() + + def render(self): + return self._get_obs() + + +class AirSimDroneEnv(AirSimEnv): + def __init__( + self, ip_address, action_type, control_type, step_length, image_shape, goal + ): + super().__init__(image_shape) + + self.step_length = step_length + self.action_type = action_type + self.control_type = control_type + self.image_shape = image_shape + self.goal = airsim.Vector3r(goal[0], goal[1], goal[2]) + self.start_ts = 0 + + if self.action_type is "discrete": + self.action_space = spaces.Discrete(6) + + if control_type is not "position" and not "velocity": + print( + "Invalid control type for discrete actions. Defaulting to position" + ) + self.control_type = "position" + else: + self.control_type = control_type + + elif self.action_type is "continuous": + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(4,)) + if control_type is not "rate" and not "pwm": + print("Invalid control type for continuous actions. Defaulting to rate") + self.control_type = "rate" + else: + self.control_type = control_type + else: + print( + "Must choose an action type {'discrete','continuous'}. Defaulting to discrete." 
+ ) + self.action_space = spaces.Discrete(6) + self.control_type = "position" + + self.state = { + "position": np.zeros(3), + "collision": False, + "prev_position": np.zeros(3), + } + + self.drone = airsim.MultirotorClient(ip=ip_address) + self._setup_flight() + + self.image_request = airsim.ImageRequest( + "front_center", airsim.ImageType.Scene, False, False + ) + + # List of discrete actions + self.forward = [self.step_length, 0, 0] + self.backward = [-self.step_length, 0, 0] + self.left = [0, -self.step_length, 0] + self.right = [0, self.step_length, 0] + self.up = [0, 0, -self.step_length] + self.down = [0, 0, self.step_length] + + self.discrete_action_dict = { + 0: self.forward, + 1: self.backward, + 2: self.right, + 3: self.left, + 4: self.up, + 5: self.down, + } + + def __del__(self): + self.drone.reset() + + def _setup_flight(self): + self.drone.reset() + self.drone.enableApiControl(True) + self.drone.armDisarm(True) + self.drone.moveToPositionAsync(0, 0, -2, 2).join() + + def _get_obs(self): + response = self.drone.simGetImages([self.image_request]) + image = np.reshape( + np.fromstring(response[0].image_data_uint8, dtype=np.uint8), + self.image_shape, + ) + _drone_state = self.drone.getMultirotorState() + position = _drone_state.kinematics_estimated.position.to_numpy_array() + collision = self.drone.simGetCollisionInfo().has_collided + + self.state["prev_position"] = self.state["position"] + self.state["position"] = position + self.state["collision"] = collision + + return image + + def _compute_reward(self): + pos = self.state["position"] + current_pos = airsim.Vector3r(pos[0], pos[1], pos[2]) + done = False + + if current_pos == self.goal: + done = True + reward = 100 + return reward, done + elif self.state["collision"] == True: + done = True + reward = -100 + + dist = current_pos.distance_to(self.goal) + reward += abs(pos[1]) - abs(prev_pos[1]) + + return reward, done + + def _do_action(self, action): + if self.action_type is "discrete": + command = self.action_to_command(action) + if self.control_type is "position": + pos = self.state["position"] + self.pose.position.x_val = float(pos[0] + action[0]) + self.pose.position.y_val = float(pos[1] + action[1]) + self.pose.position.z_val = float(pos[2] + action[2]) + self.drone.simSetVehiclePose(self.pose, False) + + elif self.control_type is "velocity": + self.drone.moveByVelocityZAsync( + float(action[0]), + float(action[1]), + float(action[2]), + self.step_length, + ).join() + + elif self.action_type is "continuous": + if self.control_type is "rate": + self.drone.moveByRollPitchYawrateThrottleAsync( + float(action[0]), + float(action[1]), + float(action[2]), + float(action[3]), + self.step_length, + ).join() + + elif self.control_type == "pwm": + self.drone.moveByMotorPWMsAsync( + float(action[0]), + float(action[1]), + float(action[2]), + float(action[3]), + ) + + def step(self, action): + self._do_action(action) + obs = self._get_obs() + reward, done = self._compute_reward() + + return obs, reward, done, self.state + + def reset(self): + self._setup_flight() + self._get_obs() + + def actions_to_op(self, action): + return self.discrete_action_dict[action] From 0eefc660c4a5eb66235b52701be5cd3126b6a4bd Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Sat, 12 Dec 2020 00:20:05 -0800 Subject: [PATCH 02/14] Intermediate commit, car env --- PythonClient/airgym/envs/__init__.py | 4 +- PythonClient/airgym/envs/airsim_env.py | 138 +++++++++++++++++++++++-- 2 files changed, 134 insertions(+), 8 deletions(-) diff --git 
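
A note on the registration in PATCH 01 above: the entry point string "air_gym.envs:AirSimDroneEnv" does not match the airgym package directory, so gym.make would fail until PATCH 02 switches the imports to airgym.envs; the action_type/control_type checks also compare strings with `is`, which tests object identity rather than equality, and later patches drop this branching entirely. As a rough, illustrative sketch of how the registered id is meant to be consumed (the IP address, goal and image shape below are placeholders, and a drone simulation must already be listening on that address):

import gym
from gym.envs.registration import register

# Same call as airgym/__init__.py, with the module path spelled to match the
# actual package name ("airgym", not "air_gym").
register(
    id="airsim-drone-v0",
    entry_point="airgym.envs:AirSimDroneEnv",
)

# gym.make forwards extra keyword arguments to AirSimDroneEnv.__init__.
env = gym.make(
    "airsim-drone-v0",
    ip_address="127.0.0.1",        # AirSim RPC server
    action_type="discrete",
    control_type="position",
    step_length=0.25,
    image_shape=(144, 256, 3),     # must match the configured camera resolution
    goal=(10.0, 0.0, -5.0),        # placeholder goal in NED coordinates
)
env.reset()
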
a/PythonClient/airgym/envs/__init__.py b/PythonClient/airgym/envs/__init__.py index 73601239a3..7185769202 100644 --- a/PythonClient/airgym/envs/__init__.py +++ b/PythonClient/airgym/envs/__init__.py @@ -1,2 +1,2 @@ -from air_gym.envs.airsim_env import AirSimEnv -from air_gym.envs.airsim_env import AirSimDroneEnv +from airgym.envs.airsim_env import AirSimEnv +from airgym.envs.airsim_env import AirSimDroneEnv diff --git a/PythonClient/airgym/envs/airsim_env.py b/PythonClient/airgym/envs/airsim_env.py index 178ae70ab8..2018685a50 100644 --- a/PythonClient/airgym/envs/airsim_env.py +++ b/PythonClient/airgym/envs/airsim_env.py @@ -2,11 +2,8 @@ import airsim import gym -from gym import error, spaces, utils +from gym import spaces from gym.utils import seeding -from gym.spaces import Tuple, Box, Discrete, MultiDiscrete - -from collections import OrderedDict class AirSimEnv(gym.Env): @@ -121,7 +118,7 @@ def _setup_flight(self): self.drone.armDisarm(True) self.drone.moveToPositionAsync(0, 0, -2, 2).join() - def _get_obs(self): + def get_observation(self): response = self.drone.simGetImages([self.image_request]) image = np.reshape( np.fromstring(response[0].image_data_uint8, dtype=np.uint8), @@ -137,7 +134,7 @@ def _get_obs(self): return image - def _compute_reward(self): + def compute_reward(self): pos = self.state["position"] current_pos = airsim.Vector3r(pos[0], pos[1], pos[2]) done = False @@ -204,3 +201,132 @@ def reset(self): def actions_to_op(self, action): return self.discrete_action_dict[action] + + +class AirSimCarEnv(AirSimEnv): + def __init__( + self, ip_address, action_type, control_type, step_length, image_shape, goal + ): + super().__init__(image_shape) + + self.step_length = step_length + self.action_type = action_type + self.control_type = control_type + self.image_shape = image_shape + self.goal = airsim.Vector3r(goal[0], goal[1], goal[2]) + self.start_ts = 0 + + if self.action_type is "discrete": + self.action_space = spaces.Discrete(6) + + if control_type is not "position" and not "velocity": + print( + "Invalid control type for discrete actions. Defaulting to position" + ) + self.control_type = "position" + else: + self.control_type = control_type + + elif self.action_type is "continuous": + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(4,)) + if control_type is not "rate" and not "pwm": + print("Invalid control type for continuous actions. Defaulting to rate") + self.control_type = "rate" + else: + self.control_type = control_type + else: + print( + "Must choose an action type {'discrete','continuous'}. Defaulting to discrete." 
+ ) + self.action_space = spaces.Discrete(6) + self.control_type = "position" + + self.state = { + "position": np.zeros(3), + "collision": False, + "prev_position": np.zeros(3), + } + + self.car = airsim.CarClient(ip=ip_address) + + self.image_request = airsim.ImageRequest( + "front_center", airsim.ImageType.Scene, False, False + ) + + self.self.car_controls = airsim.CarControls() + + def __del__(self): + self.drone.reset() + + def _setup_flight(self): + self.drone.reset() + self.drone.enableApiControl(True) + self.drone.armDisarm(True) + self.drone.moveToPositionAsync(0, 0, -2, 2).join() + + def _get_obs(self): + response = self.drone.simGetImages([self.image_request]) + image = np.reshape( + np.fromstring(response[0].image_data_uint8, dtype=np.uint8), + self.image_shape, + ) + _drone_state = self.drone.getMultirotorState() + position = _drone_state.kinematics_estimated.position.to_numpy_array() + collision = self.drone.simGetCollisionInfo().has_collided + + self.state["prev_position"] = self.state["position"] + self.state["position"] = position + self.state["collision"] = collision + + return image + + def _compute_reward(self): + pos = self.state["position"] + current_pos = airsim.Vector3r(pos[0], pos[1], pos[2]) + done = False + + if current_pos == self.goal: + done = True + reward = 100 + return reward, done + elif self.state["collision"] == True: + done = True + reward = -100 + + dist = current_pos.distance_to(self.goal) + reward += abs(pos[1]) - abs(prev_pos[1]) + + return reward, done + + def take_action(self, action): + self.car_controls.brake = 0 + self.car_controls.throttle = 1 + if action == 0: + self.car_controls.throttle = 0 + self.car_controls.brake = 1 + elif action == 1: + self.car_controls.steering = 0 + elif action == 2: + self.car_controls.steering = 0.5 + elif action == 3: + self.car_controls.steering = -0.5 + elif action == 4: + self.car_controls.steering = 0.25 + else: + self.car_controls.steering = -0.25 + + self.car.setCarControls(self.car_controls) + + def step(self, action): + self._do_action(action) + obs = self._get_obs() + reward, done = self._compute_reward() + + return obs, reward, done, self.state + + def reset(self): + self._setup_flight() + self._get_obs() + + def actions_to_op(self, action): + return self.discrete_action_dict[action] From f99cffed214501887cb44a7139297eea2c7509ad Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Mon, 14 Dec 2020 18:51:37 -0800 Subject: [PATCH 03/14] Intermediate changes --- PythonClient/airgym/envs/airsim_env.py | 56 ++---------------- PythonClient/car/DQNcar copy.py | 80 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 51 deletions(-) create mode 100644 PythonClient/car/DQNcar copy.py diff --git a/PythonClient/airgym/envs/airsim_env.py b/PythonClient/airgym/envs/airsim_env.py index 2018685a50..436f97b207 100644 --- a/PythonClient/airgym/envs/airsim_env.py +++ b/PythonClient/airgym/envs/airsim_env.py @@ -204,43 +204,12 @@ def actions_to_op(self, action): class AirSimCarEnv(AirSimEnv): - def __init__( - self, ip_address, action_type, control_type, step_length, image_shape, goal - ): + def __init__(self, ip_address, image_shape): super().__init__(image_shape) - self.step_length = step_length - self.action_type = action_type - self.control_type = control_type self.image_shape = image_shape - self.goal = airsim.Vector3r(goal[0], goal[1], goal[2]) self.start_ts = 0 - if self.action_type is "discrete": - self.action_space = spaces.Discrete(6) - - if control_type is not "position" and not "velocity": - print( - 
"Invalid control type for discrete actions. Defaulting to position" - ) - self.control_type = "position" - else: - self.control_type = control_type - - elif self.action_type is "continuous": - self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(4,)) - if control_type is not "rate" and not "pwm": - print("Invalid control type for continuous actions. Defaulting to rate") - self.control_type = "rate" - else: - self.control_type = control_type - else: - print( - "Must choose an action type {'discrete','continuous'}. Defaulting to discrete." - ) - self.action_space = spaces.Discrete(6) - self.control_type = "position" - self.state = { "position": np.zeros(3), "collision": False, @@ -253,12 +222,12 @@ def __init__( "front_center", airsim.ImageType.Scene, False, False ) - self.self.car_controls = airsim.CarControls() + self.car_controls = airsim.CarControls() def __del__(self): self.drone.reset() - def _setup_flight(self): + def _setup_car(self): self.drone.reset() self.drone.enableApiControl(True) self.drone.armDisarm(True) @@ -280,23 +249,8 @@ def _get_obs(self): return image - def _compute_reward(self): - pos = self.state["position"] - current_pos = airsim.Vector3r(pos[0], pos[1], pos[2]) - done = False - - if current_pos == self.goal: - done = True - reward = 100 - return reward, done - elif self.state["collision"] == True: - done = True - reward = -100 - - dist = current_pos.distance_to(self.goal) - reward += abs(pos[1]) - abs(prev_pos[1]) - - return reward, done + def compute_reward(self): + NotImplemented() def take_action(self, action): self.car_controls.brake = 0 diff --git a/PythonClient/car/DQNcar copy.py b/PythonClient/car/DQNcar copy.py new file mode 100644 index 0000000000..81dd5a5d10 --- /dev/null +++ b/PythonClient/car/DQNcar copy.py @@ -0,0 +1,80 @@ +import setup_path +import airsim +import numpy as np +import math +import time +from argparse import ArgumentParser + +from airgym.envs.airsim_env import AirSimCarEnv + + +class AirSimDriveEnv(AirSimCarEnv): + def __init__(self, ip_address, step_length, image_shape): + super().__init__(self, ip_address, step_length, image_shape) + + def take_action(self, action): + self.car_controls.brake = 0 + self.car_controls.throttle = 1 + if action == 0: + self.car_controls.throttle = 0 + self.car_controls.brake = 1 + elif action == 1: + self.car_controls.steering = 0 + elif action == 2: + self.car_controls.steering = 0.5 + elif action == 3: + self.car_controls.steering = -0.5 + elif action == 4: + self.car_controls.steering = 0.25 + else: + self.car_controls.steering = -0.25 + + self.car.setCarControls(self.car_controls) + + def compute_reward(self): + MAX_SPEED = 300 + MIN_SPEED = 10 + thresh_dist = 3.5 + beta = 3 + + z = 0 + pts = [ + np.array([0, -1, z]), + np.array([130, -1, z]), + np.array([130, 125, z]), + np.array([0, 125, z]), + np.array([0, -1, z]), + np.array([130, -1, z]), + np.array([130, -128, z]), + np.array([0, -128, z]), + np.array([0, -1, z]), + ] + pd = car_state.kinematics_estimated.position + car_pt = np.array([pd.x_val, pd.y_val, pd.z_val]) + + dist = 10000000 + for i in range(0, len(pts) - 1): + dist = min( + dist, + np.linalg.norm(np.cross((car_pt - pts[i]), (car_pt - pts[i + 1]))) + / np.linalg.norm(pts[i] - pts[i + 1]), + ) + + # print(dist) + if dist > thresh_dist: + reward = -3 + else: + reward_dist = math.exp(-beta * dist) - 0.5 + reward_speed = ( + (car_state.speed - MIN_SPEED) / (MAX_SPEED - MIN_SPEED) + ) - 0.5 + reward = reward_dist + reward_speed + + done = 0 + if reward < -1: + done = 1 + if 
self.car_controls.brake == 0: + if self.car_state.speed <= 5: + done = 1 + + return reward, done From c4017a9ebde0d471bb0e6a059caf357d421b00a5 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Mon, 14 Dec 2020 23:43:22 -0800 Subject: [PATCH 04/14] Restructure --- PythonClient/airgym/__init__.py | 6 - PythonClient/airgym/envs/__init__.py | 2 - .../reinforcement_learning/airgym/__init__.py | 9 + .../airgym/envs/__init__.py | 4 + .../airgym/envs/airsim_env.py | 41 ++++ .../airgym/envs/car_env.py} | 55 +++++- .../airgym/envs/drone_env.py} | 179 +++++------------- .../reinforcement_learning/dqn_car.py | 43 +++++ .../reinforcement_learning/dqn_drone.py | 46 +++++ .../reinforcement_learning/setup_path.py | 52 +++++ 10 files changed, 286 insertions(+), 151 deletions(-) delete mode 100644 PythonClient/airgym/__init__.py delete mode 100644 PythonClient/airgym/envs/__init__.py create mode 100644 PythonClient/reinforcement_learning/airgym/__init__.py create mode 100644 PythonClient/reinforcement_learning/airgym/envs/__init__.py create mode 100644 PythonClient/reinforcement_learning/airgym/envs/airsim_env.py rename PythonClient/{car/DQNcar copy.py => reinforcement_learning/airgym/envs/car_env.py} (55%) rename PythonClient/{airgym/envs/airsim_env.py => reinforcement_learning/airgym/envs/drone_env.py} (57%) create mode 100644 PythonClient/reinforcement_learning/dqn_car.py create mode 100644 PythonClient/reinforcement_learning/dqn_drone.py create mode 100644 PythonClient/reinforcement_learning/setup_path.py diff --git a/PythonClient/airgym/__init__.py b/PythonClient/airgym/__init__.py deleted file mode 100644 index 004e2062da..0000000000 --- a/PythonClient/airgym/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from gym.envs.registration import register - -register( - id='airsim-drone-v0', - entry_point='air_gym.envs:AirSimDroneEnv', -) diff --git a/PythonClient/airgym/envs/__init__.py b/PythonClient/airgym/envs/__init__.py deleted file mode 100644 index 7185769202..0000000000 --- a/PythonClient/airgym/envs/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from airgym.envs.airsim_env import AirSimEnv -from airgym.envs.airsim_env import AirSimDroneEnv diff --git a/PythonClient/reinforcement_learning/airgym/__init__.py b/PythonClient/reinforcement_learning/airgym/__init__.py new file mode 100644 index 0000000000..08cf191667 --- /dev/null +++ b/PythonClient/reinforcement_learning/airgym/__init__.py @@ -0,0 +1,9 @@ +from gym.envs.registration import register + +register( + id="airsim-drone-trackpowerlines-v0", entry_point="airgym.envs:AirSimDroneEnv", +) + +register( + id="airsim-car-drive-v0", entry_point="airgym.envs:AirSimCarEnv", +) diff --git a/PythonClient/reinforcement_learning/airgym/envs/__init__.py b/PythonClient/reinforcement_learning/airgym/envs/__init__.py new file mode 100644 index 0000000000..9a3b34c33f --- /dev/null +++ b/PythonClient/reinforcement_learning/airgym/envs/__init__.py @@ -0,0 +1,4 @@ +from airgym.envs.airsim_env import AirSimEnv +from airgym.envs.car_env import AirSimCarEnv +from airgym.envs.drone_env import AirSimDroneEnv + diff --git a/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py new file mode 100644 index 0000000000..e6620fe0ec --- /dev/null +++ b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py @@ -0,0 +1,41 @@ +import numpy as np +import airsim + +import gym +from gym import spaces +from gym.utils import seeding + + +class AirSimEnv(gym.Env): + metadata = {"render.modes": ["rgb_array"]} + + def 
__init__(self, image_shape): + self.observation_space = spaces.Box(0, 255, shape=image_shape, dtype=np.uint8) + self._seed() + + self.viewer = None + self.steps = 0 + self.no_episode = 0 + self.reward_sum = 0 + + def __del__(self): + raise NotImplementedError() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _compute_reward(self): + raise NotImplementedError() + + def step(self, action): + raise NotImplementedError() + + def _get_obs(self): + raise NotImplementedError() + + def close(self): + raise NotImplementedError() + + def render(self): + return self._get_obs() diff --git a/PythonClient/car/DQNcar copy.py b/PythonClient/reinforcement_learning/airgym/envs/car_env.py similarity index 55% rename from PythonClient/car/DQNcar copy.py rename to PythonClient/reinforcement_learning/airgym/envs/car_env.py index 81dd5a5d10..64a2f991f3 100644 --- a/PythonClient/car/DQNcar copy.py +++ b/PythonClient/reinforcement_learning/airgym/envs/car_env.py @@ -4,15 +4,39 @@ import math import time from argparse import ArgumentParser +from airgym.envs.airsim_env import AirSimEnv -from airgym.envs.airsim_env import AirSimCarEnv +class AirSimCarEnv(AirSimEnv): + def __init__(self, ip_address, image_shape): + super().__init__(image_shape) -class AirSimDriveEnv(AirSimCarEnv): - def __init__(self, ip_address, step_length, image_shape): - super().__init__(self, ip_address, step_length, image_shape) + self.image_shape = image_shape + self.start_ts = 0 - def take_action(self, action): + self.state = { + "position": np.zeros(3), + "collision": False, + "prev_position": np.zeros(3), + } + + self.car = airsim.CarClient(ip=ip_address) + + self.image_request = airsim.ImageRequest( + "front_center", airsim.ImageType.Scene, False, False + ) + + self.car_controls = airsim.CarControls() + self.car_state = None + + self.state["pose"] = None + self.state["prev_pose"] = None + self.state["collision"] = None + + def __del__(self): + self.car.reset() + + def _do_action(self, action): self.car_controls.brake = 0 self.car_controls.throttle = 1 if action == 0: @@ -31,7 +55,22 @@ def take_action(self, action): self.car.setCarControls(self.car_controls) - def compute_reward(self): + def _get_obs(self): + response = self.drone.simGetImages([self.image_request]) + image = np.reshape( + np.fromstring(response[0].image_data_uint8, dtype=np.uint8), + self.image_shape, + ) + self.car_state = self.car.getCarState() + collision = self.car.simGetCollisionInfo().has_collided + + self.state["prev_pose"] = self.state["pose"] + self.state["pose"] = self.car_state.kinematics_estimated + self.state["collision"] = collision + + return image + + def _compute_reward(self): MAX_SPEED = 300 MIN_SPEED = 10 thresh_dist = 3.5 @@ -49,7 +88,7 @@ def compute_reward(self): np.array([0, -128, z]), np.array([0, -1, z]), ] - pd = car_state.kinematics_estimated.position + pd = self.state["pose"].position car_pt = np.array([pd.x_val, pd.y_val, pd.z_val]) dist = 10000000 @@ -66,7 +105,7 @@ def compute_reward(self): else: reward_dist = math.exp(-beta * dist) - 0.5 reward_speed = ( - (car_state.speed - MIN_SPEED) / (MAX_SPEED - MIN_SPEED) + (self.car_state.speed - MIN_SPEED) / (MAX_SPEED - MIN_SPEED) ) - 0.5 reward = reward_dist + reward_speed diff --git a/PythonClient/airgym/envs/airsim_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py similarity index 57% rename from PythonClient/airgym/envs/airsim_env.py rename to PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 
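
The `_get_obs` implementations above still decode the camera response with `np.fromstring`, which NumPy has deprecated in favour of `np.frombuffer`, and at this stage `AirSimCarEnv` still queries `self.drone` (PATCH 07 later switches it to `self.car`). A small stand-alone sketch of the decoding step for an uncompressed Scene request (the helper name is mine, not part of the patch):

import numpy as np

def decode_scene(response):
    # `response` is one airsim.ImageResponse from simGetImages() for a request
    # made with pixels_as_float=False and compress=False.
    img1d = np.frombuffer(response.image_data_uint8, dtype=np.uint8)
    # Infer the channel count from the buffer rather than hard-coding it;
    # recent AirSim builds return 3-channel (BGR) Scene images.
    return img1d.reshape(response.height, response.width, -1)
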
436f97b207..dfafc5f343 100644 --- a/PythonClient/airgym/envs/airsim_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -1,44 +1,10 @@ -import numpy as np +import setup_path import airsim - -import gym -from gym import spaces -from gym.utils import seeding - - -class AirSimEnv(gym.Env): - metadata = {"render.modes": ["rgb_array"]} - - def __init__(self, image_shape): - self.observation_space = spaces.Box(0, 255, shape=image_shape, dtype=np.uint8) - self._seed() - - self.viewer = None - self.steps = 0 - self.no_episode = 0 - self.reward_sum = 0 - - def __del__(self): - raise NotImplementedError() - - def _seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def _compute_reward(self): - raise NotImplementedError() - - def step(self, action): - raise NotImplementedError() - - def _get_obs(self): - raise NotImplementedError() - - def close(self): - raise NotImplementedError() - - def render(self): - return self._get_obs() +import numpy as np +import math +import time +from argparse import ArgumentParser +from airgym.envs.airsim_env import AirSimEnv class AirSimDroneEnv(AirSimEnv): @@ -118,7 +84,7 @@ def _setup_flight(self): self.drone.armDisarm(True) self.drone.moveToPositionAsync(0, 0, -2, 2).join() - def get_observation(self): + def _get_obs(self): response = self.drone.simGetImages([self.image_request]) image = np.reshape( np.fromstring(response[0].image_data_uint8, dtype=np.uint8), @@ -134,24 +100,6 @@ def get_observation(self): return image - def compute_reward(self): - pos = self.state["position"] - current_pos = airsim.Vector3r(pos[0], pos[1], pos[2]) - done = False - - if current_pos == self.goal: - done = True - reward = 100 - return reward, done - elif self.state["collision"] == True: - done = True - reward = -100 - - dist = current_pos.distance_to(self.goal) - reward += abs(pos[1]) - abs(prev_pos[1]) - - return reward, done - def _do_action(self, action): if self.action_type is "discrete": command = self.action_to_command(action) @@ -188,88 +136,49 @@ def _do_action(self, action): float(action[3]), ) - def step(self, action): - self._do_action(action) - obs = self._get_obs() - reward, done = self._compute_reward() - - return obs, reward, done, self.state - - def reset(self): - self._setup_flight() - self._get_obs() - - def actions_to_op(self, action): - return self.discrete_action_dict[action] - - -class AirSimCarEnv(AirSimEnv): - def __init__(self, ip_address, image_shape): - super().__init__(image_shape) - - self.image_shape = image_shape - self.start_ts = 0 - - self.state = { - "position": np.zeros(3), - "collision": False, - "prev_position": np.zeros(3), - } - - self.car = airsim.CarClient(ip=ip_address) - - self.image_request = airsim.ImageRequest( - "front_center", airsim.ImageType.Scene, False, False - ) - - self.car_controls = airsim.CarControls() + def compute_reward(self): + thresh_dist = 7 + beta = 1 - def __del__(self): - self.drone.reset() + z = -10 + pts = [ + np.array([-0.55265, -31.9786, -19.0225]), + np.array([48.59735, -63.3286, -60.07256]), + np.array([193.5974, -55.0786, -46.32256]), + np.array([369.2474, 35.32137, -62.5725]), + np.array([541.3474, 143.6714, -32.07256]), + ] - def _setup_car(self): - self.drone.reset() - self.drone.enableApiControl(True) - self.drone.armDisarm(True) - self.drone.moveToPositionAsync(0, 0, -2, 2).join() + quad_pt = np.array(list((quad_state.x_val, quad_state.y_val, quad_state.z_val))) - def _get_obs(self): - response = self.drone.simGetImages([self.image_request]) - 
image = np.reshape( - np.fromstring(response[0].image_data_uint8, dtype=np.uint8), - self.image_shape, - ) - _drone_state = self.drone.getMultirotorState() - position = _drone_state.kinematics_estimated.position.to_numpy_array() - collision = self.drone.simGetCollisionInfo().has_collided - - self.state["prev_position"] = self.state["position"] - self.state["position"] = position - self.state["collision"] = collision - - return image + if collision_info.has_collided: + reward = -100 + else: + dist = 10000000 + for i in range(0, len(pts) - 1): + dist = min( + dist, + np.linalg.norm(np.cross((quad_pt - pts[i]), (quad_pt - pts[i + 1]))) + / np.linalg.norm(pts[i] - pts[i + 1]), + ) - def compute_reward(self): - NotImplemented() + # print(dist) + if dist > thresh_dist: + reward = -10 + else: + reward_dist = math.exp(-beta * dist) - 0.5 + reward_speed = ( + np.linalg.norm([quad_vel.x_val, quad_vel.y_val, quad_vel.z_val]) + - 0.5 + ) + reward = reward_dist + reward_speed - def take_action(self, action): - self.car_controls.brake = 0 - self.car_controls.throttle = 1 - if action == 0: - self.car_controls.throttle = 0 - self.car_controls.brake = 1 - elif action == 1: - self.car_controls.steering = 0 - elif action == 2: - self.car_controls.steering = 0.5 - elif action == 3: - self.car_controls.steering = -0.5 - elif action == 4: - self.car_controls.steering = 0.25 - else: - self.car_controls.steering = -0.25 + done = 0 + if reward <= -10: + done = 1 + return done - self.car.setCarControls(self.car_controls) + return reward, done def step(self, action): self._do_action(action) diff --git a/PythonClient/reinforcement_learning/dqn_car.py b/PythonClient/reinforcement_learning/dqn_car.py new file mode 100644 index 0000000000..a16d30c3d2 --- /dev/null +++ b/PythonClient/reinforcement_learning/dqn_car.py @@ -0,0 +1,43 @@ +import setup_path +import gym +import airgym + +import numpy as np +import matplotlib.pyplot as plt +import time +import torch + +from stable_baselines3 import DQN, PPO +from stable_baselines3.common.monitor import Monitor +from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback + +env = DummyVecEnv( + [ + lambda: Monitor( + gym.make( + "airgym:airsim-car-drive-v0", + ip_address="127.0.0.1", + image_shape=(64, 64, 3), + ) + ) + ] +) + +model = DQN( + "CnnPolicy", + env, + learning_rate=0.00025, + verbose=1, + batch_size=32, + train_freq=4, + target_update_interval=10000, + learning_starts=200000, + buffer_size=500000, + max_grad_norm=10, + exploration_fraction=0.1, + exploration_final_eps=0.01, + device="cuda", + tensorboard_log="./tb_logs/", +) diff --git a/PythonClient/reinforcement_learning/dqn_drone.py b/PythonClient/reinforcement_learning/dqn_drone.py new file mode 100644 index 0000000000..94ff107705 --- /dev/null +++ b/PythonClient/reinforcement_learning/dqn_drone.py @@ -0,0 +1,46 @@ +import setup_path +import gym +import airgym + +import numpy as np +import matplotlib.pyplot as plt +import time +import torch + +from stable_baselines3 import DQN, PPO +from stable_baselines3.common.monitor import Monitor +from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback + +env = DummyVecEnv( + [ + lambda: Monitor( + gym.make( + "airgym:airsim-drone-trackpowerlines-v0", + ip_address="127.0.0.1", + 
action_type="discrete", + control_type="position", + step_length=0.25, + image_shape=(64, 64, 3), + ) + ) + ] +) + +model = DQN( + "CnnPolicy", + env, + learning_rate=0.00025, + verbose=1, + batch_size=32, + train_freq=4, + target_update_interval=10000, + learning_starts=10000, + buffer_size=500000, + max_grad_norm=10, + exploration_fraction=0.1, + exploration_final_eps=0.01, + device="cuda", + tensorboard_log="./tb_logs/", +) diff --git a/PythonClient/reinforcement_learning/setup_path.py b/PythonClient/reinforcement_learning/setup_path.py new file mode 100644 index 0000000000..362113fea8 --- /dev/null +++ b/PythonClient/reinforcement_learning/setup_path.py @@ -0,0 +1,52 @@ +# Import this module to automatically setup path to local airsim module +# This module first tries to see if airsim module is installed via pip +# If it does then we don't do anything else +# Else we look up grand-parent folder to see if it has airsim folder +# and if it does then we add that in sys.path + +import os,sys,inspect,logging + +#this class simply tries to see if airsim +class SetupPath: + @staticmethod + def getDirLevels(path): + path_norm = os.path.normpath(path) + return len(path_norm.split(os.sep)) + + @staticmethod + def getCurrentPath(): + cur_filepath = os.path.abspath(inspect.getfile(inspect.currentframe())) + return os.path.dirname(cur_filepath) + + @staticmethod + def getGrandParentDir(): + cur_path = SetupPath.getCurrentPath() + if SetupPath.getDirLevels(cur_path) >= 2: + return os.path.dirname(os.path.dirname(cur_path)) + return '' + + @staticmethod + def getParentDir(): + cur_path = SetupPath.getCurrentPath() + if SetupPath.getDirLevels(cur_path) >= 1: + return os.path.dirname(cur_path) + return '' + + @staticmethod + def addAirSimModulePath(): + # if airsim module is installed then don't do anything else + #import pkgutil + #airsim_loader = pkgutil.find_loader('airsim') + #if airsim_loader is not None: + # return + + parent = SetupPath.getParentDir() + if parent != '': + airsim_path = os.path.join(parent, 'airsim') + client_path = os.path.join(airsim_path, 'client.py') + if os.path.exists(client_path): + sys.path.insert(0, parent) + else: + logging.warning("airsim module not found in parent folder. 
Using installed package (pip install airsim).") + +SetupPath.addAirSimModulePath() From 24a5b0059a61e77638566eea5e9f4160ea8d15d2 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Tue, 15 Dec 2020 00:11:31 -0800 Subject: [PATCH 05/14] Restructure --- .../airgym/envs/airsim_env.py | 17 ++++----------- .../airgym/envs/car_env.py | 1 + .../airgym/envs/drone_env.py | 21 ------------------- 3 files changed, 5 insertions(+), 34 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py index e6620fe0ec..5a8c45e7d9 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py @@ -11,30 +11,21 @@ class AirSimEnv(gym.Env): def __init__(self, image_shape): self.observation_space = spaces.Box(0, 255, shape=image_shape, dtype=np.uint8) - self._seed() - self.viewer = None - self.steps = 0 - self.no_episode = 0 - self.reward_sum = 0 def __del__(self): raise NotImplementedError() - def _seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def _compute_reward(self): + def _get_obs(self): raise NotImplementedError() - def step(self, action): + def _compute_reward(self): raise NotImplementedError() - def _get_obs(self): + def close(self): raise NotImplementedError() - def close(self): + def step(self, action): raise NotImplementedError() def render(self): diff --git a/PythonClient/reinforcement_learning/airgym/envs/car_env.py b/PythonClient/reinforcement_learning/airgym/envs/car_env.py index 64a2f991f3..0ccc1cadf9 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/car_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/car_env.py @@ -21,6 +21,7 @@ def __init__(self, ip_address, image_shape): } self.car = airsim.CarClient(ip=ip_address) + self.action_space = spaces.Discrete(6) self.image_request = airsim.ImageRequest( "front_center", airsim.ImageType.Scene, False, False diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index dfafc5f343..e76af1d4f5 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -22,7 +22,6 @@ def __init__( if self.action_type is "discrete": self.action_space = spaces.Discrete(6) - if control_type is not "position" and not "velocity": print( "Invalid control type for discrete actions. 
Defaulting to position" @@ -58,23 +57,6 @@ def __init__( "front_center", airsim.ImageType.Scene, False, False ) - # List of discrete actions - self.forward = [self.step_length, 0, 0] - self.backward = [-self.step_length, 0, 0] - self.left = [0, -self.step_length, 0] - self.right = [0, self.step_length, 0] - self.up = [0, 0, -self.step_length] - self.down = [0, 0, self.step_length] - - self.discrete_action_dict = { - 0: self.forward, - 1: self.backward, - 2: self.right, - 3: self.left, - 4: self.up, - 5: self.down, - } - def __del__(self): self.drone.reset() @@ -190,6 +172,3 @@ def step(self, action): def reset(self): self._setup_flight() self._get_obs() - - def actions_to_op(self, action): - return self.discrete_action_dict[action] From 14ce38441db8984f3a3dac5db08511de9328fcca Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Tue, 15 Dec 2020 00:23:42 -0800 Subject: [PATCH 06/14] Replicate old DQN environment for drone --- .../airgym/envs/drone_env.py | 124 ++++++++---------- 1 file changed, 54 insertions(+), 70 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index e76af1d4f5..3f20ab8cf5 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -8,41 +8,13 @@ class AirSimDroneEnv(AirSimEnv): - def __init__( - self, ip_address, action_type, control_type, step_length, image_shape, goal - ): + def __init__(self, ip_address, image_shape): super().__init__(image_shape) self.step_length = step_length self.action_type = action_type self.control_type = control_type self.image_shape = image_shape - self.goal = airsim.Vector3r(goal[0], goal[1], goal[2]) - self.start_ts = 0 - - if self.action_type is "discrete": - self.action_space = spaces.Discrete(6) - if control_type is not "position" and not "velocity": - print( - "Invalid control type for discrete actions. Defaulting to position" - ) - self.control_type = "position" - else: - self.control_type = control_type - - elif self.action_type is "continuous": - self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(4,)) - if control_type is not "rate" and not "pwm": - print("Invalid control type for continuous actions. Defaulting to rate") - self.control_type = "rate" - else: - self.control_type = control_type - else: - print( - "Must choose an action type {'discrete','continuous'}. Defaulting to discrete." 
- ) - self.action_space = spaces.Discrete(6) - self.control_type = "position" self.state = { "position": np.zeros(3), @@ -64,7 +36,10 @@ def _setup_flight(self): self.drone.reset() self.drone.enableApiControl(True) self.drone.armDisarm(True) - self.drone.moveToPositionAsync(0, 0, -2, 2).join() + self.drone.takeoffAsync().join() + self.drone.moveToPositionAsync(initX, initY, initZ, 5).join() + self.drone.moveByVelocityAsync(1, -0.67, -0.8, 5).join() + time.sleep(0.5) def _get_obs(self): response = self.drone.simGetImages([self.image_request]) @@ -72,51 +47,27 @@ def _get_obs(self): np.fromstring(response[0].image_data_uint8, dtype=np.uint8), self.image_shape, ) - _drone_state = self.drone.getMultirotorState() - position = _drone_state.kinematics_estimated.position.to_numpy_array() - collision = self.drone.simGetCollisionInfo().has_collided + self.drone_state = self.drone.getMultirotorState() self.state["prev_position"] = self.state["position"] - self.state["position"] = position + self.state["position"] = self.drone_state.kinematics_estimated.position + self.state["velocity"] = self.drone_state.kinematics_estimated.linear_velocity + + collision = self.drone.simGetCollisionInfo().has_collided self.state["collision"] = collision return image def _do_action(self, action): - if self.action_type is "discrete": - command = self.action_to_command(action) - if self.control_type is "position": - pos = self.state["position"] - self.pose.position.x_val = float(pos[0] + action[0]) - self.pose.position.y_val = float(pos[1] + action[1]) - self.pose.position.z_val = float(pos[2] + action[2]) - self.drone.simSetVehiclePose(self.pose, False) - - elif self.control_type is "velocity": - self.drone.moveByVelocityZAsync( - float(action[0]), - float(action[1]), - float(action[2]), - self.step_length, - ).join() - - elif self.action_type is "continuous": - if self.control_type is "rate": - self.drone.moveByRollPitchYawrateThrottleAsync( - float(action[0]), - float(action[1]), - float(action[2]), - float(action[3]), - self.step_length, - ).join() - - elif self.control_type == "pwm": - self.drone.moveByMotorPWMsAsync( - float(action[0]), - float(action[1]), - float(action[2]), - float(action[3]), - ) + quad_offset = self.interpret_action(action) + quad_vel = self.drone.getMultirotorState().kinematics_estimated.linear_velocity + self.drone.moveByVelocityAsync( + quad_vel.x_val + quad_offset[0], + quad_vel.y_val + quad_offset[1], + quad_vel.z_val + quad_offset[2], + 5, + ).join() + time.sleep(0.5) def compute_reward(self): thresh_dist = 7 @@ -131,7 +82,15 @@ def compute_reward(self): np.array([541.3474, 143.6714, -32.07256]), ] - quad_pt = np.array(list((quad_state.x_val, quad_state.y_val, quad_state.z_val))) + quad_pt = np.array( + list( + ( + self.state["position"].x_val, + self.state["position"].y_val, + self.state["position"].z_val, + ) + ) + ) if collision_info.has_collided: reward = -100 @@ -150,7 +109,13 @@ def compute_reward(self): else: reward_dist = math.exp(-beta * dist) - 0.5 reward_speed = ( - np.linalg.norm([quad_vel.x_val, quad_vel.y_val, quad_vel.z_val]) + np.linalg.norm( + [ + self.state["velocity"].x_val, + self.state["velocity"].y_val, + self.state["velocity"].z_val, + ] + ) - 0.5 ) reward = reward_dist + reward_speed @@ -172,3 +137,22 @@ def step(self, action): def reset(self): self._setup_flight() self._get_obs() + + def interpret_action(self, action): + scaling_factor = 0.25 + if action == 0: + quad_offset = (0, 0, 0) + elif action == 1: + quad_offset = (scaling_factor, 0, 0) + elif action == 2: 
+ quad_offset = (0, scaling_factor, 0) + elif action == 3: + quad_offset = (0, 0, scaling_factor) + elif action == 4: + quad_offset = (-scaling_factor, 0, 0) + elif action == 5: + quad_offset = (0, -scaling_factor, 0) + elif action == 6: + quad_offset = (0, 0, -scaling_factor) + + return quad_offset From a65d808b305e0386cee272d2cbab59175e3cb0bd Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Tue, 15 Dec 2020 13:29:27 -0800 Subject: [PATCH 07/14] DQN car example working --- .../airgym/envs/airsim_env.py | 1 - .../airgym/envs/car_env.py | 51 +++++++++++++++---- .../airgym/envs/drone_env.py | 2 +- .../reinforcement_learning/dqn_car.py | 20 +++++++- 4 files changed, 62 insertions(+), 12 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py index 5a8c45e7d9..7fb4cf221c 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/airsim_env.py @@ -3,7 +3,6 @@ import gym from gym import spaces -from gym.utils import seeding class AirSimEnv(gym.Env): diff --git a/PythonClient/reinforcement_learning/airgym/envs/car_env.py b/PythonClient/reinforcement_learning/airgym/envs/car_env.py index 0ccc1cadf9..8e42955785 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/car_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/car_env.py @@ -3,7 +3,9 @@ import numpy as np import math import time -from argparse import ArgumentParser + +import gym +from gym import spaces from airgym.envs.airsim_env import AirSimEnv @@ -24,7 +26,7 @@ def __init__(self, ip_address, image_shape): self.action_space = spaces.Discrete(6) self.image_request = airsim.ImageRequest( - "front_center", airsim.ImageType.Scene, False, False + "0", airsim.ImageType.DepthPerspective, True, False ) self.car_controls = airsim.CarControls() @@ -34,12 +36,18 @@ def __init__(self, ip_address, image_shape): self.state["prev_pose"] = None self.state["collision"] = None + def _setup_car(self): + self.car.reset() + self.car.enableApiControl(True) + self.car.armDisarm(True) + time.sleep(0.01) + def __del__(self): self.car.reset() def _do_action(self, action): self.car_controls.brake = 0 - self.car_controls.throttle = 1 + self.car_controls.throttle = 5 if action == 0: self.car_controls.throttle = 0 self.car_controls.brake = 1 @@ -55,13 +63,24 @@ def _do_action(self, action): self.car_controls.steering = -0.25 self.car.setCarControls(self.car_controls) + time.sleep(0.5) + + def transform_obs(self, responses): + img1d = np.array(responses[0].image_data_float, dtype=np.float) + img1d = 255 / np.maximum(np.ones(img1d.size), img1d) + img2d = np.reshape(img1d, (responses[0].height, responses[0].width)) + + from PIL import Image + + image = Image.fromarray(img2d) + im_final = np.array(image.resize((84, 84)).convert("L")) + + return im_final.reshape([84, 84, 1]) def _get_obs(self): - response = self.drone.simGetImages([self.image_request]) - image = np.reshape( - np.fromstring(response[0].image_data_uint8, dtype=np.uint8), - self.image_shape, - ) + responses = self.car.simGetImages([self.image_request]) + image = self.transform_obs(responses) + self.car_state = self.car.getCarState() collision = self.car.simGetCollisionInfo().has_collided @@ -114,7 +133,21 @@ def _compute_reward(self): if reward < -1: done = 1 if self.car_controls.brake == 0: - if self.car_state.speed <= 5: + if self.car_state.speed <= 1: done = 1 + if self.state["collision"]: + done = 1 return reward, done + + 
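
The `_compute_reward` above (and the drone version in PATCH 06) scores the vehicle by its distance to a list of waypoints: for every consecutive pair of waypoints, `|cross(p - a, p - b)| / |b - a|` is the distance from the current position `p` to the line through `a` and `b`, and the reward decays exponentially with the smallest such distance. A stand-alone restatement, with the helper name and sample values being illustrative only:

import math
import numpy as np

def distance_to_track(point, waypoints):
    # Smallest distance from `point` to any line through consecutive waypoints.
    point = np.asarray(point, dtype=float)
    dists = []
    for a, b in zip(waypoints[:-1], waypoints[1:]):
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        dists.append(np.linalg.norm(np.cross(point - a, point - b)) / np.linalg.norm(b - a))
    return min(dists)

# First corner of the rectangular track used above:
pts = [np.array([0, -1, 0]), np.array([130, -1, 0]), np.array([130, 125, 0])]
d = distance_to_track([10.0, 2.5, 0.0], pts)   # 3.5, i.e. exactly at thresh_dist
reward_dist = math.exp(-3 * d) - 0.5           # beta = 3, as in the car reward

Note that this is the distance to the infinite line through each waypoint pair, not to the segment between them, which is fine for a closed track but worth keeping in mind when reusing the reward elsewhere.
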
def step(self, action): + self._do_action(action) + obs = self._get_obs() + reward, done = self._compute_reward() + + return obs, reward, done, self.state + + def reset(self): + self._setup_car() + self._do_action(1) + return self._get_obs() diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 3f20ab8cf5..2954b5ab5d 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -136,7 +136,7 @@ def step(self, action): def reset(self): self._setup_flight() - self._get_obs() + return self._get_obs() def interpret_action(self, action): scaling_factor = 0.25 diff --git a/PythonClient/reinforcement_learning/dqn_car.py b/PythonClient/reinforcement_learning/dqn_car.py index a16d30c3d2..fb9427606f 100644 --- a/PythonClient/reinforcement_learning/dqn_car.py +++ b/PythonClient/reinforcement_learning/dqn_car.py @@ -19,7 +19,7 @@ gym.make( "airgym:airsim-car-drive-v0", ip_address="127.0.0.1", - image_shape=(64, 64, 3), + image_shape=(84, 84, 1), ) ) ] @@ -41,3 +41,21 @@ device="cuda", tensorboard_log="./tb_logs/", ) + + +callbacks = [] +eval_callback = EvalCallback( + env, + callback_on_new_best=None, + n_eval_episodes=5, + best_model_save_path=".", + log_path=".", + eval_freq=10000, +) +callbacks.append(eval_callback) + +kwargs = {} +kwargs["callback"] = callbacks + +model.learn(total_timesteps=5e5, tb_log_name="dqn_run_" + str(time.time()), **kwargs) +model.save("dqn_airsim_car") From 5513f192d3f8b4bfda65d8768365a9d6e75950e4 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Tue, 15 Dec 2020 17:11:52 -0800 Subject: [PATCH 08/14] DQN drone example working --- .../reinforcement_learning/airgym/__init__.py | 4 +- .../airgym/envs/drone_env.py | 56 +++++++++++-------- .../reinforcement_learning/dqn_car.py | 2 +- .../reinforcement_learning/dqn_drone.py | 28 ++++++++-- 4 files changed, 60 insertions(+), 30 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/__init__.py b/PythonClient/reinforcement_learning/airgym/__init__.py index 08cf191667..020d393515 100644 --- a/PythonClient/reinforcement_learning/airgym/__init__.py +++ b/PythonClient/reinforcement_learning/airgym/__init__.py @@ -1,9 +1,9 @@ from gym.envs.registration import register register( - id="airsim-drone-trackpowerlines-v0", entry_point="airgym.envs:AirSimDroneEnv", + id="airsim-drone-sample-v0", entry_point="airgym.envs:AirSimDroneEnv", ) register( - id="airsim-car-drive-v0", entry_point="airgym.envs:AirSimCarEnv", + id="airsim-car-sample-v0", entry_point="airgym.envs:AirSimCarEnv", ) diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 2954b5ab5d..938a4a8ba5 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -4,16 +4,16 @@ import math import time from argparse import ArgumentParser + +import gym +from gym import spaces from airgym.envs.airsim_env import AirSimEnv class AirSimDroneEnv(AirSimEnv): - def __init__(self, ip_address, image_shape): + def __init__(self, ip_address, step_length, image_shape): super().__init__(image_shape) - self.step_length = step_length - self.action_type = action_type - self.control_type = control_type self.image_shape = image_shape self.state = { @@ -23,10 +23,11 @@ def __init__(self, ip_address, image_shape): } self.drone = 
airsim.MultirotorClient(ip=ip_address) + self.action_space = spaces.Discrete(6) self._setup_flight() self.image_request = airsim.ImageRequest( - "front_center", airsim.ImageType.Scene, False, False + "0", airsim.ImageType.DepthPerspective, True, False ) def __del__(self): @@ -37,16 +38,27 @@ def _setup_flight(self): self.drone.enableApiControl(True) self.drone.armDisarm(True) self.drone.takeoffAsync().join() - self.drone.moveToPositionAsync(initX, initY, initZ, 5).join() + + # Set home position and velocity + self.drone.moveToPositionAsync(-0.55265, -31.9786, -19.0225, 5).join() self.drone.moveByVelocityAsync(1, -0.67, -0.8, 5).join() time.sleep(0.5) + def transform_obs(self, responses): + img1d = np.array(responses[0].image_data_float, dtype=np.float) + img1d = 255 / np.maximum(np.ones(img1d.size), img1d) + img2d = np.reshape(img1d, (responses[0].height, responses[0].width)) + + from PIL import Image + + image = Image.fromarray(img2d) + im_final = np.array(image.resize((84, 84)).convert("L")) + + return im_final.reshape([84, 84, 1]) + def _get_obs(self): - response = self.drone.simGetImages([self.image_request]) - image = np.reshape( - np.fromstring(response[0].image_data_uint8, dtype=np.uint8), - self.image_shape, - ) + responses = self.drone.simGetImages([self.image_request]) + image = self.transform_obs(responses) self.drone_state = self.drone.getMultirotorState() self.state["prev_position"] = self.state["position"] @@ -69,7 +81,7 @@ def _do_action(self, action): ).join() time.sleep(0.5) - def compute_reward(self): + def _compute_reward(self): thresh_dist = 7 beta = 1 @@ -92,7 +104,7 @@ def compute_reward(self): ) ) - if collision_info.has_collided: + if self.state["collision"]: reward = -100 else: dist = 10000000 @@ -123,7 +135,6 @@ def compute_reward(self): done = 0 if reward <= -10: done = 1 - return done return reward, done @@ -139,20 +150,19 @@ def reset(self): return self._get_obs() def interpret_action(self, action): - scaling_factor = 0.25 if action == 0: - quad_offset = (0, 0, 0) + quad_offset = (self.step_length, 0, 0) elif action == 1: - quad_offset = (scaling_factor, 0, 0) + quad_offset = (0, self.step_length, 0) elif action == 2: - quad_offset = (0, scaling_factor, 0) + quad_offset = (0, 0, self.step_length) elif action == 3: - quad_offset = (0, 0, scaling_factor) + quad_offset = (-self.step_length, 0, 0) elif action == 4: - quad_offset = (-scaling_factor, 0, 0) + quad_offset = (0, -self.step_length, 0) elif action == 5: - quad_offset = (0, -scaling_factor, 0) - elif action == 6: - quad_offset = (0, 0, -scaling_factor) + quad_offset = (0, 0, -self.step_length) + else: + quad_offset = (0, 0, 0) return quad_offset diff --git a/PythonClient/reinforcement_learning/dqn_car.py b/PythonClient/reinforcement_learning/dqn_car.py index fb9427606f..ada6bb5c21 100644 --- a/PythonClient/reinforcement_learning/dqn_car.py +++ b/PythonClient/reinforcement_learning/dqn_car.py @@ -17,7 +17,7 @@ [ lambda: Monitor( gym.make( - "airgym:airsim-car-drive-v0", + "airgym:airsim-car-sample-v0", ip_address="127.0.0.1", image_shape=(84, 84, 1), ) diff --git a/PythonClient/reinforcement_learning/dqn_drone.py b/PythonClient/reinforcement_learning/dqn_drone.py index 94ff107705..faeea2c411 100644 --- a/PythonClient/reinforcement_learning/dqn_drone.py +++ b/PythonClient/reinforcement_learning/dqn_drone.py @@ -17,12 +17,10 @@ [ lambda: Monitor( gym.make( - "airgym:airsim-drone-trackpowerlines-v0", + "airgym:airsim-drone-sample-v0", ip_address="127.0.0.1", - action_type="discrete", - control_type="position", 
step_length=0.25, - image_shape=(64, 64, 3), + image_shape=(84, 84, 1), ) ) ] @@ -44,3 +42,25 @@ device="cuda", tensorboard_log="./tb_logs/", ) + + +callbacks = [] +eval_callback = EvalCallback( + env, + callback_on_new_best=None, + n_eval_episodes=5, + best_model_save_path=".", + log_path=".", + eval_freq=10000, +) +callbacks.append(eval_callback) + +kwargs = {} +kwargs["callback"] = callbacks + +model.learn( + total_timesteps=5e5, + tb_log_name="dqn_airsim_drone_run_" + str(time.time()), + **kwargs +) +model.save("dqn_airsim_drone_policy") From 9eaeb73a1c3fbfa4f72307b7471e501b3a23dcf1 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Wed, 16 Dec 2020 13:22:35 -0800 Subject: [PATCH 09/14] Remove legacy scripts, doc update --- PythonClient/car/DQNcar.py | 536 ----------------- PythonClient/multirotor/DQNdrone.py | 545 ------------------ .../reinforcement_learning/dqn_car.py | 6 +- docs/reinforcement_learning.md | 136 +++-- 4 files changed, 97 insertions(+), 1126 deletions(-) delete mode 100644 PythonClient/car/DQNcar.py delete mode 100644 PythonClient/multirotor/DQNdrone.py diff --git a/PythonClient/car/DQNcar.py b/PythonClient/car/DQNcar.py deleted file mode 100644 index 5c48da308c..0000000000 --- a/PythonClient/car/DQNcar.py +++ /dev/null @@ -1,536 +0,0 @@ -import setup_path -import airsim - -import math -import time -from argparse import ArgumentParser - -#import gym #pip install gym -import numpy as np -from cntk.core import Value #pip install cntk-gpu -from cntk.initializer import he_uniform -from cntk.layers import Sequential, Convolution2D, Dense, default_options -from cntk.layers.typing import Signature, Tensor -from cntk.learners import adam, learning_rate_schedule, momentum_schedule, UnitType -from cntk.logging import TensorBoardProgressWriter -from cntk.ops import abs, argmax, element_select, less, relu, reduce_max, reduce_sum, square -from cntk.ops.functions import CloneMethod, Function -from cntk.train import Trainer - -import pickle - -class ReplayMemory(object): - """ - ReplayMemory keeps track of the environment dynamic. - We store all the transitions (s(t), action, s(t+1), reward, done). - The replay memory allows us to efficiently sample minibatches from it, and generate the correct state representation - (w.r.t the number of previous frames needed). - """ - def __init__(self, size, sample_shape, history_length=4): - self._pos = 0 - self._count = 0 - self._max_size = size - self._history_length = max(1, history_length) - self._state_shape = sample_shape - self._states = np.zeros((size,) + sample_shape, dtype=np.float32) - self._actions = np.zeros(size, dtype=np.uint8) - self._rewards = np.zeros(size, dtype=np.float32) - self._terminals = np.zeros(size, dtype=np.float32) - - def __len__(self): - """ Returns the number of items currently present in the memory - Returns: Int >= 0 - """ - return self._count - - def append(self, state, action, reward, done): - """ Appends the specified transition to the memory. 
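
Once `dqn_car.py` or `dqn_drone.py` above has finished training and saved a policy, the same stable-baselines3 API can reload and evaluate it. A usage sketch, assuming the drone policy file written by `model.save` and an AirSim instance on 127.0.0.1:

import setup_path  # as in the scripts above, to pick up the local airsim package
import gym
import airgym

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

env = DummyVecEnv(
    [
        lambda: Monitor(
            gym.make(
                "airgym:airsim-drone-sample-v0",
                ip_address="127.0.0.1",
                step_length=0.25,
                image_shape=(84, 84, 1),
            )
        )
    ]
)

model = DQN.load("dqn_airsim_drone_policy", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"mean reward over 5 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")
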
- - Attributes: - state (Tensor[sample_shape]): The state to append - action (int): An integer representing the action done - reward (float): An integer representing the reward received for doing this action - done (bool): A boolean specifying if this state is a terminal (episode has finished) - """ - assert state.shape == self._state_shape, \ - 'Invalid state shape (required: %s, got: %s)' % (self._state_shape, state.shape) - - self._states[self._pos] = state - self._actions[self._pos] = action - self._rewards[self._pos] = reward - self._terminals[self._pos] = done - - self._count = max(self._count, self._pos + 1) - self._pos = (self._pos + 1) % self._max_size - - def sample(self, size): - """ Generate size random integers mapping indices in the memory. - The returned indices can be retrieved using #get_state(). - See the method #minibatch() if you want to retrieve samples directly. - - Attributes: - size (int): The minibatch size - - Returns: - Indexes of the sampled states ([int]) - """ - - # Local variable access is faster in loops - count, pos, history_len, terminals = self._count - 1, self._pos, \ - self._history_length, self._terminals - indexes = [] - - while len(indexes) < size: - index = np.random.randint(history_len, count) - - if index not in indexes: - - # if not wrapping over current pointer, - # then check if there is terminal state wrapped inside - if not (index >= pos > index - history_len): - if not terminals[(index - history_len):index].any(): - indexes.append(index) - - return indexes - - def minibatch(self, size): - """ Generate a minibatch with the number of samples specified by the size parameter. - - Attributes: - size (int): Minibatch size - - Returns: - tuple: Tensor[minibatch_size, input_shape...], [int], [float], [bool] - """ - indexes = self.sample(size) - - pre_states = np.array([self.get_state(index) for index in indexes], dtype=np.float32) - post_states = np.array([self.get_state(index + 1) for index in indexes], dtype=np.float32) - actions = self._actions[indexes] - rewards = self._rewards[indexes] - dones = self._terminals[indexes] - - return pre_states, actions, post_states, rewards, dones - - def get_state(self, index): - """ - Return the specified state with the replay memory. A state consists of - the last `history_length` perceptions. - - Attributes: - index (int): State's index - - Returns: - State at specified index (Tensor[history_length, input_shape...]) - """ - if self._count == 0: - raise IndexError('Empty Memory') - - index %= self._count - history_length = self._history_length - - # If index > history_length, take from a slice - if index >= history_length: - return self._states[(index - (history_length - 1)):index + 1, ...] - else: - indexes = np.arange(index - history_length + 1, index + 1) - return self._states.take(indexes, mode='wrap', axis=0) - -class History(object): - """ - Accumulator keeping track of the N previous frames to be used by the agent - for evaluation - """ - - def __init__(self, shape): - self._buffer = np.zeros(shape, dtype=np.float32) - - @property - def value(self): - """ Underlying buffer with N previous states stacked along first axis - - Returns: - Tensor[shape] - """ - return self._buffer - - def append(self, state): - """ Append state to the history - - Attributes: - state (Tensor) : The state to append to the memory - """ - self._buffer[:-1] = self._buffer[1:] - self._buffer[-1] = state - - def reset(self): - """ Reset the memory. 
Underlying buffer set all indexes to 0 - - """ - self._buffer.fill(0) - -class LinearEpsilonAnnealingExplorer(object): - """ - Exploration policy using Linear Epsilon Greedy - - Attributes: - start (float): start value - end (float): end value - steps (int): number of steps between start and end - """ - - def __init__(self, start, end, steps): - self._start = start - self._stop = end - self._steps = steps - - self._step_size = (end - start) / steps - - def __call__(self, num_actions): - """ - Select a random action out of `num_actions` possibilities. - - Attributes: - num_actions (int): Number of actions available - """ - return np.random.choice(num_actions) - - def _epsilon(self, step): - """ Compute the epsilon parameter according to the specified step - - Attributes: - step (int) - """ - if step < 0: - return self._start - elif step > self._steps: - return self._stop - else: - return self._step_size * step + self._start - - def is_exploring(self, step): - """ Commodity method indicating if the agent should explore - - Attributes: - step (int) : Current step - - Returns: - bool : True if exploring, False otherwise - """ - return np.random.rand() < self._epsilon(step) - -def huber_loss(y, y_hat, delta): - """ Compute the Huber Loss as part of the model graph - - Huber Loss is more robust to outliers. It is defined as: - if |y - y_hat| < delta : - 0.5 * (y - y_hat)**2 - else : - delta * |y - y_hat| - 0.5 * delta**2 - - Attributes: - y (Tensor[-1, 1]): Target value - y_hat(Tensor[-1, 1]): Estimated value - delta (float): Outliers threshold - - Returns: - CNTK Graph Node - """ - half_delta_squared = 0.5 * delta * delta - error = y - y_hat - abs_error = abs(error) - - less_than = 0.5 * square(error) - more_than = (delta * abs_error) - half_delta_squared - loss_per_sample = element_select(less(abs_error, delta), less_than, more_than) - - return reduce_sum(loss_per_sample, name='loss') - -class DeepQAgent(object): - """ - Implementation of Deep Q Neural Network agent like in: - Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015) - """ - def __init__(self, input_shape, nb_actions, - gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), - learning_rate=0.00025, momentum=0.95, minibatch_size=32, - memory_size=500000, train_after=200000, train_interval=4, target_update_interval=10000, - monitor=True): - self.input_shape = input_shape - self.nb_actions = nb_actions - self.gamma = gamma - - self._train_after = train_after - self._train_interval = train_interval - self._target_update_interval = target_update_interval - - self._explorer = explorer - self._minibatch_size = minibatch_size - self._history = History(input_shape) - self._memory = ReplayMemory(memory_size, input_shape[1:], 4) - self._num_actions_taken = 0 - - # Metrics accumulator - self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] - - # Action Value model (used by agent to interact with the environment) - with default_options(activation=relu, init=he_uniform()): - self._action_value_net = Sequential([ - Convolution2D((8, 8), 16, strides=4), - Convolution2D((4, 4), 32, strides=2), - Convolution2D((3, 3), 32, strides=1), - Dense(256, init=he_uniform(scale=0.01)), - Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) - ]) - self._action_value_net.update_signature(Tensor[input_shape]) - - # Target model used to compute the target Q-values in training, updated - # less frequently for increased stability. 
- self._target_net = self._action_value_net.clone(CloneMethod.freeze) - - # Function computing Q-values targets as part of the computation graph - @Function - @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) - def compute_q_targets(post_states, rewards, terminals): - return element_select( - terminals, - rewards, - gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, - ) - - # Define the loss, using Huber Loss (more robust to outliers) - @Function - @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], - post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) - def criterion(pre_states, actions, post_states, rewards, terminals): - # Compute the q_targets - q_targets = compute_q_targets(post_states, rewards, terminals) - - # actions is a 1-hot encoding of the action done by the agent - q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) - - # Define training criterion as the Huber Loss function - return huber_loss(q_targets, q_acted, 1.0) - - # Adam based SGD - lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) - m_schedule = momentum_schedule(momentum) - vm_schedule = momentum_schedule(0.999) - l_sgd = adam(self._action_value_net.parameters, lr_schedule, - momentum=m_schedule, variance_momentum=vm_schedule) - - self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None - self._learner = l_sgd - self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) - - #self._trainer.restore_from_checkpoint('models/oldmodels/model800000') - - def act(self, state): - """ This allows the agent to select the next action to perform in regard of the current state of the environment. - It follows the terminology used in the Nature paper. - - Attributes: - state (Tensor[input_shape]): The current environment state - - Returns: Int >= 0 : Next action to do - """ - # Append the state to the short term memory (ie. History) - self._history.append(state) - - # If policy requires agent to explore, sample random action - if self._explorer.is_exploring(self._num_actions_taken): - action = self._explorer(self.nb_actions) - else: - # Use the network to output the best action - env_with_history = self._history.value - q_values = self._action_value_net.eval( - # Append batch axis with only one sample to evaluate - env_with_history.reshape((1,) + env_with_history.shape) - ) - - self._episode_q_means.append(np.mean(q_values)) - self._episode_q_stddev.append(np.std(q_values)) - - # Return the value maximizing the expected reward - action = q_values.argmax() - - # Keep track of interval action counter - self._num_actions_taken += 1 - return action - - def observe(self, old_state, action, reward, done): - """ This allows the agent to observe the output of doing the action it selected through act() on the old_state - - Attributes: - old_state (Tensor[input_shape]): Previous environment state - action (int): Action done by the agent - reward (float): Reward for doing this action in the old_state environment - done (bool): Indicate if the action has terminated the environment - """ - self._episode_rewards.append(reward) - - # If done, reset short term memory (ie. 
History) - if done: - # Plot the metrics through Tensorboard and reset buffers - if self._metrics_writer is not None: - self._plot_metrics() - self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] - - # Reset the short term memory - self._history.reset() - - # Append to long term memory - self._memory.append(old_state, action, reward, done) - - def train(self): - """ This allows the agent to train itself to better understand the environment dynamics. - The agent will compute the expected reward for the state(t+1) - and update the expected reward at step t according to this. - - The target expectation is computed through the Target Network, which is a more stable version - of the Action Value Network for increasing training stability. - - The Target Network is a frozen copy of the Action Value Network updated as regular intervals. - """ - - agent_step = self._num_actions_taken - - if agent_step >= self._train_after: - if (agent_step % self._train_interval) == 0: - pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size) - self._trainer.train_minibatch( - self._trainer.loss_function.argument_map( - pre_states=pre_states, - actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions), - post_states=post_states, - rewards=rewards, - terminals=terminals - ) - ) - - # Update the Target Network if needed - if (agent_step % self._target_update_interval) == 0: - self._target_net = self._action_value_net.clone(CloneMethod.freeze) - filename = "models\model%d" % agent_step - self._trainer.save_checkpoint(filename) - - def _plot_metrics(self): - """Plot current buffers accumulated values to visualize agent learning - """ - if len(self._episode_q_means) > 0: - mean_q = np.asscalar(np.mean(self._episode_q_means)) - self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) - - if len(self._episode_q_stddev) > 0: - std_q = np.asscalar(np.mean(self._episode_q_stddev)) - self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) - - self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken) - - -def transform_input(responses): - img1d = np.array(responses[0].image_data_float, dtype=np.float) - img1d = 255/np.maximum(np.ones(img1d.size), img1d) - img2d = np.reshape(img1d, (responses[0].height, responses[0].width)) - - from PIL import Image - image = Image.fromarray(img2d) - im_final = np.array(image.resize((84, 84)).convert('L')) - - return im_final - -def interpret_action(action): - car_controls.brake = 0 - car_controls.throttle = 1 - if action == 0: - car_controls.throttle = 0 - car_controls.brake = 1 - elif action == 1: - car_controls.steering = 0 - elif action == 2: - car_controls.steering = 0.5 - elif action == 3: - car_controls.steering = -0.5 - elif action == 4: - car_controls.steering = 0.25 - else: - car_controls.steering = -0.25 - return car_controls - - -def compute_reward(car_state): - MAX_SPEED = 300 - MIN_SPEED = 10 - thresh_dist = 3.5 - beta = 3 - - z = 0 - pts = [np.array([0, -1, z]), np.array([130, -1, z]), np.array([130, 125, z]), np.array([0, 125, z]), np.array([0, -1, z]), np.array([130, -1, z]), np.array([130, -128, z]), np.array([0, -128, z]), np.array([0, -1, z])] - pd = car_state.kinematics_estimated.position - car_pt = np.array([pd.x_val, pd.y_val, pd.z_val]) - - dist = 10000000 - for i in range(0, len(pts)-1): - dist = min(dist, np.linalg.norm(np.cross((car_pt - pts[i]), (car_pt - 
pts[i+1])))/np.linalg.norm(pts[i]-pts[i+1])) - - #print(dist) - if dist > thresh_dist: - reward = -3 - else: - reward_dist = (math.exp(-beta*dist) - 0.5) - reward_speed = (((car_state.speed - MIN_SPEED)/(MAX_SPEED - MIN_SPEED)) - 0.5) - reward = reward_dist + reward_speed - - return reward - -def isDone(car_state, car_controls, reward): - done = 0 - if reward < -1: - done = 1 - if car_controls.brake == 0: - if car_state.speed <= 5: - done = 1 - return done - -client = airsim.CarClient() -client.confirmConnection() -client.enableApiControl(True) -car_controls = airsim.CarControls() - -# Make RL agent -NumBufferFrames = 4 -SizeRows = 84 -SizeCols = 84 -NumActions = 6 -agent = DeepQAgent((NumBufferFrames, SizeRows, SizeCols), NumActions, monitor=True) - -# Train -epoch = 100 -current_step = 0 -max_steps = epoch * 250000 - -responses = client.simGetImages([airsim.ImageRequest("0", airsim.ImageType.DepthPerspective, True, False)]) -current_state = transform_input(responses) -while True: - action = agent.act(current_state) - car_controls = interpret_action(action) - client.setCarControls(car_controls) - - car_state = client.getCarState() - reward = compute_reward(car_state) - done = isDone(car_state, car_controls, reward) - if done == 1: - reward = -10 - - agent.observe(current_state, action, reward, done) - agent.train() - - if done: - client.reset() - car_control = interpret_action(1) - client.setCarControls(car_control) - time.sleep(1) - current_step +=1 - - responses = client.simGetImages([airsim.ImageRequest("0", airsim.ImageType.DepthPerspective, True, False)]) - current_state = transform_input(responses) diff --git a/PythonClient/multirotor/DQNdrone.py b/PythonClient/multirotor/DQNdrone.py deleted file mode 100644 index 4ca7eb2d6c..0000000000 --- a/PythonClient/multirotor/DQNdrone.py +++ /dev/null @@ -1,545 +0,0 @@ -import setup_path -import airsim - -from argparse import ArgumentParser - -import numpy as np -import time -import math - -from cntk.core import Value -from cntk.initializer import he_uniform -from cntk.layers import Sequential, Convolution2D, Dense, default_options -from cntk.layers.typing import Signature, Tensor -from cntk.learners import adam, learning_rate_schedule, momentum_schedule, UnitType -from cntk.logging import TensorBoardProgressWriter -from cntk.ops import abs, argmax, element_select, less, relu, reduce_max, reduce_sum, square -from cntk.ops.functions import CloneMethod, Function -from cntk.train import Trainer - -import csv - -class ReplayMemory(object): - """ - ReplayMemory keeps track of the environment dynamic. - We store all the transitions (s(t), action, s(t+1), reward, done). - The replay memory allows us to efficiently sample mini-batches from it, and generate the correct state representation - (w.r.t the number of previous frames needed). - """ - def __init__(self, size, sample_shape, history_length=4): - self._pos = 0 - self._count = 0 - self._max_size = size - self._history_length = max(1, history_length) - self._state_shape = sample_shape - self._states = np.zeros((size,) + sample_shape, dtype=np.float32) - self._actions = np.zeros(size, dtype=np.uint8) - self._rewards = np.zeros(size, dtype=np.float32) - self._terminals = np.zeros(size, dtype=np.float32) - - def __len__(self): - """ Returns the number of items currently present in the memory - Returns: Int >= 0 - """ - return self._count - - def append(self, state, action, reward, done): - """ Appends the specified transition to the memory. 
- - Attributes: - state (Tensor[sample_shape]): The state to append - action (int): An integer representing the action done - reward (float): An integer representing the reward received for doing this action - done (bool): A boolean specifying if this state is a terminal (episode has finished) - """ - assert state.shape == self._state_shape, \ - 'Invalid state shape (required: %s, got: %s)' % (self._state_shape, state.shape) - - self._states[self._pos] = state - self._actions[self._pos] = action - self._rewards[self._pos] = reward - self._terminals[self._pos] = done - - self._count = max(self._count, self._pos + 1) - self._pos = (self._pos + 1) % self._max_size - - def sample(self, size): - """ Generate size random integers mapping indices in the memory. - The returned indices can be retrieved using #get_state(). - See the method #mini-batch() if you want to retrieve samples directly. - - Attributes: - size (int): The mini-batch size - - Returns: - Indexes of the sampled states ([int]) - """ - - # Local variable access is faster in loops - count, pos, history_len, terminals = self._count - 1, self._pos, \ - self._history_length, self._terminals - indexes = [] - - while len(indexes) < size: - index = np.random.randint(history_len, count) - - if index not in indexes: - - # if not wrapping over current pointer, - # then check if there is terminal state wrapped inside - if not (index >= pos > index - history_len): - if not terminals[(index - history_len):index].any(): - indexes.append(index) - - return indexes - - def minibatch(self, size): - """ Generate a minibatch with the number of samples specified by the size parameter. - - Attributes: - size (int): Minibatch size - - Returns: - tuple: Tensor[minibatch_size, input_shape...], [int], [float], [bool] - """ - indexes = self.sample(size) - - pre_states = np.array([self.get_state(index) for index in indexes], dtype=np.float32) - post_states = np.array([self.get_state(index + 1) for index in indexes], dtype=np.float32) - actions = self._actions[indexes] - rewards = self._rewards[indexes] - dones = self._terminals[indexes] - - return pre_states, actions, post_states, rewards, dones - - def get_state(self, index): - """ - Return the specified state with the replay memory. A state consists of - the last `history_length` perceptions. - - Attributes: - index (int): State's index - - Returns: - State at specified index (Tensor[history_length, input_shape...]) - """ - if self._count == 0: - raise IndexError('Empty Memory') - - index %= self._count - history_length = self._history_length - - # If index > history_length, take from a slice - if index >= history_length: - return self._states[(index - (history_length - 1)):index + 1, ...] - else: - indexes = np.arange(index - history_length + 1, index + 1) - return self._states.take(indexes, mode='wrap', axis=0) - -class History(object): - """ - Accumulator keeping track of the N previous frames to be used by the agent - for evaluation - """ - - def __init__(self, shape): - self._buffer = np.zeros(shape, dtype=np.float32) - - @property - def value(self): - """ Underlying buffer with N previous states stacked along first axis - - Returns: - Tensor[shape] - """ - return self._buffer - - def append(self, state): - """ Append state to the history - - Attributes: - state (Tensor) : The state to append to the memory - """ - self._buffer[:-1] = self._buffer[1:] - self._buffer[-1] = state - - def reset(self): - """ Reset the memory. 
Underlying buffer set all indexes to 0 - - """ - self._buffer.fill(0) - -class LinearEpsilonAnnealingExplorer(object): - """ - Exploration policy using Linear Epsilon Greedy - - Attributes: - start (float): start value - end (float): end value - steps (int): number of steps between start and end - """ - - def __init__(self, start, end, steps): - self._start = start - self._stop = end - self._steps = steps - - self._step_size = (end - start) / steps - - def __call__(self, num_actions): - """ - Select a random action out of `num_actions` possibilities. - - Attributes: - num_actions (int): Number of actions available - """ - return np.random.choice(num_actions) - - def _epsilon(self, step): - """ Compute the epsilon parameter according to the specified step - - Attributes: - step (int) - """ - if step < 0: - return self._start - elif step > self._steps: - return self._stop - else: - return self._step_size * step + self._start - - def is_exploring(self, step): - """ Commodity method indicating if the agent should explore - - Attributes: - step (int) : Current step - - Returns: - bool : True if exploring, False otherwise - """ - return np.random.rand() < self._epsilon(step) - -def huber_loss(y, y_hat, delta): - """ Compute the Huber Loss as part of the model graph - - Huber Loss is more robust to outliers. It is defined as: - if |y - y_hat| < delta : - 0.5 * (y - y_hat)**2 - else : - delta * |y - y_hat| - 0.5 * delta**2 - - Attributes: - y (Tensor[-1, 1]): Target value - y_hat(Tensor[-1, 1]): Estimated value - delta (float): Outliers threshold - - Returns: - CNTK Graph Node - """ - half_delta_squared = 0.5 * delta * delta - error = y - y_hat - abs_error = abs(error) - - less_than = 0.5 * square(error) - more_than = (delta * abs_error) - half_delta_squared - loss_per_sample = element_select(less(abs_error, delta), less_than, more_than) - - return reduce_sum(loss_per_sample, name='loss') - -class DeepQAgent(object): - """ - Implementation of Deep Q Neural Network agent like in: - Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015) - """ - def __init__(self, input_shape, nb_actions, - gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), - learning_rate=0.00025, momentum=0.95, minibatch_size=32, - memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000, - monitor=True): - self.input_shape = input_shape - self.nb_actions = nb_actions - self.gamma = gamma - - self._train_after = train_after - self._train_interval = train_interval - self._target_update_interval = target_update_interval - - self._explorer = explorer - self._minibatch_size = minibatch_size - self._history = History(input_shape) - self._memory = ReplayMemory(memory_size, input_shape[1:], 4) - self._num_actions_taken = 0 - - # Metrics accumulator - self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] - - # Action Value model (used by agent to interact with the environment) - with default_options(activation=relu, init=he_uniform()): - self._action_value_net = Sequential([ - Convolution2D((8, 8), 16, strides=4), - Convolution2D((4, 4), 32, strides=2), - Convolution2D((3, 3), 32, strides=1), - Dense(256, init=he_uniform(scale=0.01)), - Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) - ]) - self._action_value_net.update_signature(Tensor[input_shape]) - - # Target model used to compute the target Q-values in training, updated - # less frequently for increased stability. 
- self._target_net = self._action_value_net.clone(CloneMethod.freeze) - - # Function computing Q-values targets as part of the computation graph - @Function - @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) - def compute_q_targets(post_states, rewards, terminals): - return element_select( - terminals, - rewards, - gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, - ) - - # Define the loss, using Huber Loss (more robust to outliers) - @Function - @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], - post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) - def criterion(pre_states, actions, post_states, rewards, terminals): - # Compute the q_targets - q_targets = compute_q_targets(post_states, rewards, terminals) - - # actions is a 1-hot encoding of the action done by the agent - q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) - - # Define training criterion as the Huber Loss function - return huber_loss(q_targets, q_acted, 1.0) - - # Adam based SGD - lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) - m_schedule = momentum_schedule(momentum) - vm_schedule = momentum_schedule(0.999) - l_sgd = adam(self._action_value_net.parameters, lr_schedule, - momentum=m_schedule, variance_momentum=vm_schedule) - - self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None - self._learner = l_sgd - self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) - - def act(self, state): - """ This allows the agent to select the next action to perform in regard of the current state of the environment. - It follows the terminology used in the Nature paper. - - Attributes: - state (Tensor[input_shape]): The current environment state - - Returns: Int >= 0 : Next action to do - """ - # Append the state to the short term memory (ie. History) - self._history.append(state) - - # If policy requires agent to explore, sample random action - if self._explorer.is_exploring(self._num_actions_taken): - action = self._explorer(self.nb_actions) - else: - # Use the network to output the best action - env_with_history = self._history.value - q_values = self._action_value_net.eval( - # Append batch axis with only one sample to evaluate - env_with_history.reshape((1,) + env_with_history.shape) - ) - - self._episode_q_means.append(np.mean(q_values)) - self._episode_q_stddev.append(np.std(q_values)) - - # Return the value maximizing the expected reward - action = q_values.argmax() - - # Keep track of interval action counter - self._num_actions_taken += 1 - return action - - def observe(self, old_state, action, reward, done): - """ This allows the agent to observe the output of doing the action it selected through act() on the old_state - - Attributes: - old_state (Tensor[input_shape]): Previous environment state - action (int): Action done by the agent - reward (float): Reward for doing this action in the old_state environment - done (bool): Indicate if the action has terminated the environment - """ - self._episode_rewards.append(reward) - - # If done, reset short term memory (ie. 
History) - if done: - # Plot the metrics through Tensorboard and reset buffers - if self._metrics_writer is not None: - self._plot_metrics() - self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] - - # Reset the short term memory - self._history.reset() - - # Append to long term memory - self._memory.append(old_state, action, reward, done) - - def train(self): - """ This allows the agent to train itself to better understand the environment dynamics. - The agent will compute the expected reward for the state(t+1) - and update the expected reward at step t according to this. - - The target expectation is computed through the Target Network, which is a more stable version - of the Action Value Network for increasing training stability. - - The Target Network is a frozen copy of the Action Value Network updated as regular intervals. - """ - - agent_step = self._num_actions_taken - - if agent_step >= self._train_after: - if (agent_step % self._train_interval) == 0: - pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size) - - self._trainer.train_minibatch( - self._trainer.loss_function.argument_map( - pre_states=pre_states, - actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions), - post_states=post_states, - rewards=rewards, - terminals=terminals - ) - ) - - # Update the Target Network if needed - if (agent_step % self._target_update_interval) == 0: - self._target_net = self._action_value_net.clone(CloneMethod.freeze) - filename = "models\model%d" % agent_step - self._trainer.save_checkpoint(filename) - - def _plot_metrics(self): - """Plot current buffers accumulated values to visualize agent learning - """ - if len(self._episode_q_means) > 0: - mean_q = np.asscalar(np.mean(self._episode_q_means)) - self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) - - if len(self._episode_q_stddev) > 0: - std_q = np.asscalar(np.mean(self._episode_q_stddev)) - self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) - - self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken) - -def transform_input(responses): - img1d = np.array(responses[0].image_data_float, dtype=np.float) - img1d = 255/np.maximum(np.ones(img1d.size), img1d) - img2d = np.reshape(img1d, (responses[0].height, responses[0].width)) - - from PIL import Image - image = Image.fromarray(img2d) - im_final = np.array(image.resize((84, 84)).convert('L')) - - return im_final - -def interpret_action(action): - scaling_factor = 0.25 - if action == 0: - quad_offset = (0, 0, 0) - elif action == 1: - quad_offset = (scaling_factor, 0, 0) - elif action == 2: - quad_offset = (0, scaling_factor, 0) - elif action == 3: - quad_offset = (0, 0, scaling_factor) - elif action == 4: - quad_offset = (-scaling_factor, 0, 0) - elif action == 5: - quad_offset = (0, -scaling_factor, 0) - elif action == 6: - quad_offset = (0, 0, -scaling_factor) - - return quad_offset - -def compute_reward(quad_state, quad_vel, collision_info): - thresh_dist = 7 - beta = 1 - - z = -10 - pts = [np.array([-.55265, -31.9786, -19.0225]), np.array([48.59735, -63.3286, -60.07256]), np.array([193.5974, -55.0786, -46.32256]), np.array([369.2474, 35.32137, -62.5725]), np.array([541.3474, 143.6714, -32.07256])] - - quad_pt = np.array(list((quad_state.x_val, quad_state.y_val, quad_state.z_val))) - - if collision_info.has_collided: - reward = -100 - else: - dist = 10000000 - for i in range(0, 
len(pts)-1): - dist = min(dist, np.linalg.norm(np.cross((quad_pt - pts[i]), (quad_pt - pts[i+1])))/np.linalg.norm(pts[i]-pts[i+1])) - - #print(dist) - if dist > thresh_dist: - reward = -10 - else: - reward_dist = (math.exp(-beta*dist) - 0.5) - reward_speed = (np.linalg.norm([quad_vel.x_val, quad_vel.y_val, quad_vel.z_val]) - 0.5) - reward = reward_dist + reward_speed - - return reward - -def isDone(reward): - done = 0 - if reward <= -10: - done = 1 - return done - -initX = -.55265 -initY = -31.9786 -initZ = -19.0225 - -# connect to the AirSim simulator -client = airsim.MultirotorClient() -client.confirmConnection() -client.enableApiControl(True) -client.armDisarm(True) - -client.takeoffAsync().join() -client.moveToPositionAsync(initX, initY, initZ, 5).join() -client.moveByVelocityAsync(1, -0.67, -0.8, 5).join() -time.sleep(0.5) - -# Make RL agent -NumBufferFrames = 4 -SizeRows = 84 -SizeCols = 84 -NumActions = 7 -agent = DeepQAgent((NumBufferFrames, SizeRows, SizeCols), NumActions, monitor=True) - -# Train -epoch = 100 -current_step = 0 -max_steps = epoch * 250000 - -responses = client.simGetImages([airsim.ImageRequest(3, airsim.ImageType.DepthPerspective, True, False)]) -current_state = transform_input(responses) - -while True: - action = agent.act(current_state) - quad_offset = interpret_action(action) - quad_vel = client.getMultirotorState().kinematics_estimated.linear_velocity - client.moveByVelocityAsync(quad_vel.x_val+quad_offset[0], quad_vel.y_val+quad_offset[1], quad_vel.z_val+quad_offset[2], 5).join() - time.sleep(0.5) - - quad_state = client.getMultirotorState().kinematics_estimated.position - quad_vel = client.getMultirotorState().kinematics_estimated.linear_velocity - collision_info = client.simGetCollisionInfo() - reward = compute_reward(quad_state, quad_vel, collision_info) - done = isDone(reward) - print('Action, Reward, Done:', action, reward, done) - - agent.observe(current_state, action, reward, done) - agent.train() - - if done: - client.moveToPositionAsync(initX, initY, initZ, 5).join() - client.moveByVelocityAsync(1, -0.67, -0.8, 5).join() - time.sleep(0.5) - current_step +=1 - - responses = client.simGetImages([airsim.ImageRequest(3, airsim.ImageType.DepthPerspective, True, False)]) - current_state = transform_input(responses) diff --git a/PythonClient/reinforcement_learning/dqn_car.py b/PythonClient/reinforcement_learning/dqn_car.py index ada6bb5c21..1506510df3 100644 --- a/PythonClient/reinforcement_learning/dqn_car.py +++ b/PythonClient/reinforcement_learning/dqn_car.py @@ -57,5 +57,7 @@ kwargs = {} kwargs["callback"] = callbacks -model.learn(total_timesteps=5e5, tb_log_name="dqn_run_" + str(time.time()), **kwargs) -model.save("dqn_airsim_car") +model.learn( + total_timesteps=5e5, tb_log_name="dqn_airsim_car_run_" + str(time.time()), **kwargs +) +model.save("dqn_airsim_car_policy") diff --git a/docs/reinforcement_learning.md b/docs/reinforcement_learning.md index bce9fa5aac..782277a2d8 100644 --- a/docs/reinforcement_learning.md +++ b/docs/reinforcement_learning.md @@ -1,15 +1,20 @@ # Reinforcement Learning in AirSim -We below describe how we can implement DQN in AirSim using CNTK. The easiest way is to first install python only CNTK ([instructions](https://docs.microsoft.com/en-us/cognitive-toolkit/setup-windows-python?tabs=cntkpy22)). +We below describe how we can implement DQN in AirSim using an OpenAI gym wrapper around AirSim API, and using stable baselines. We recommend installing stable-baselines3 in order to run these examples. 
CNTK provides several demo examples of [deep RL](https://github.com/Microsoft/CNTK/tree/master/Examples/ReinforcementLearning). We will modify the DeepQNeuralNetwork.py to work with AirSim. We can utilize most of the classes and methods corresponding to the DQN algorithm. However, there are certain additions we need to make for AirSim. #### Disclaimer + This is still in active development. What we share below is a framework that can be extended and tweaked to obtain better performance. +#### Gym wrapper + +In order to use AirSim as a gym environment, we extend and reimplement the base methods such as `step`, `_get_obs`, `_compute_reward` and `reset`. The sample environments for car and drone can be seen in `PythonClient/reinforcement_learning/*_env.py` + ## RL with Car -[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient//car/DQNcar.py) +[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning/dqn_car.py) This example works with AirSimNeighborhood environment available in [releases](https://github.com/Microsoft/AirSim/releases). @@ -42,10 +47,10 @@ def interpret_action(action): return car_controls ``` -We then define the reward function in `compute_reward` as a convex combination of how fast the vehicle is travelling and how much it deviates from the center line. The agent gets a high reward when its moving fast and staying in the center of the lane. +We then define the reward function in `_compute_reward` as a convex combination of how fast the vehicle is travelling and how much it deviates from the center line. The agent gets a high reward when its moving fast and staying in the center of the lane. ``` -def compute_reward(car_state): +def _compute_reward(car_state): MAX_SPEED = 300 MIN_SPEED = 10 thresh_dist = 3.5 @@ -71,21 +76,20 @@ def compute_reward(car_state): return reward ``` -The function `isDone` determines if the episode has terminated (e.g. due to collision). We look at the speed of the vehicle and if it is less than a threshold than the episode is considered to be terminated. +The compute reward function also subsequently determines if the episode has terminated (e.g. due to collision). We look at the speed of the vehicle and if it is less than a threshold than the episode is considered to be terminated. ``` -def isDone(car_state, car_controls, reward): - done = 0 - if reward < -1: +done = 0 +if reward < -1: + done = 1 +if car_controls.brake == 0: + if car_state.speed <= 5: done = 1 - if car_controls.brake == 0: - if car_state.speed <= 5: - done = 1 - return done +return done ``` The main loop then sequences through obtaining the image, computing the action to take according to the current policy, getting a reward and so forth. -If the episode terminates then we reset the vehicle to the original state via: +If the episode terminates then we reset the vehicle to the original state via `reset()`: ``` client.reset() @@ -96,79 +100,125 @@ client.setCarControls(car_control) time.sleep(1) ``` -Note that the simulation needs to be up and running before you execute DQNcar.py. The video below shows first few episodes of DQN training. +Once the gym-styled environment wrapper is defined as in `car_env.py`, we then make use of stable-baselines3 to run a DQN training loop. The DQN training can be configured as follows, seen in `dqn_car.py`. 
+ +``` +model = DQN( + "CnnPolicy", + env, + learning_rate=0.00025, + verbose=1, + batch_size=32, + train_freq=4, + target_update_interval=10000, + learning_starts=200000, + buffer_size=500000, + max_grad_norm=10, + exploration_fraction=0.1, + exploration_final_eps=0.01, + device="cuda", + tensorboard_log="./tb_logs/", +) +``` + +A training environment and an evaluation envrionment (see `EvalCallback` in `dqn_car.py`) can be defined. The evaluation environoment can be different from training, with different termination conditions/scene configuration. A tensorboard log directory is also defined as part of the DQN parameters. Finally, `model.learn()` starts the DQN training loop. Similarly, implementations of PPO, A3C etc. can be used from stable-baselines3. + +Note that the simulation needs to be up and running before you execute `dqn_car.py`. The video below shows first few episodes of DQN training. [![Reinforcement Learning - Car](images/dqn_car.png)](https://youtu.be/fv-oFPAqSZ4) ## RL with Quadrotor -[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient//multirotor/DQNdrone.py) +[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning/dqn_drone.py) This example works with AirSimMountainLandscape environment available in [releases](https://github.com/Microsoft/AirSim/releases). We can similarly apply RL for various autonomous flight scenarios with quadrotors. Below is an example on how RL could be used to train quadrotors to follow high tension power lines (e.g. application for energy infrastructure inspection). -There are seven actions here that correspond to different directions in which the quadrotor can move in (six directions + one hovering action). +There are six discrete actions here that correspond to different directions in which the quadrotor can move in (six directions + one hovering action). ``` -def interpret_action(action): +def interpret_action(self, action): if action == 0: - quad_offset = (0, 0, 0) + quad_offset = (self.step_length, 0, 0) elif action == 1: - quad_offset = (1, 0, 0) + quad_offset = (0, self.step_length, 0) elif action == 2: - quad_offset = (0, 1, 0) + quad_offset = (0, 0, self.step_length) elif action == 3: - quad_offset = (0, 0, 1) + quad_offset = (-self.step_length, 0, 0) elif action == 4: - quad_offset = (-1, 0, 0) + quad_offset = (0, -self.step_length, 0) elif action == 5: - quad_offset = (0, -1, 0) - elif action == 6: - quad_offset = (0, 0, -1) - return quad_offset + quad_offset = (0, 0, -self.step_length) + else: + quad_offset = (0, 0, 0) ``` The reward again is a function how how fast the quad travels in conjunction with how far it gets from the known powerlines. 
``` def compute_reward(quad_state, quad_vel, collision_info): - thresh_dist = 10 + thresh_dist = 7 beta = 1 z = -10 - pts = [np.array([0, 0, z]), np.array([130, 0, z]), np.array([130, 125, z]), np.array([0, 125, z]), np.array([0, 0, z]), np.array([130, 0, z]), np.array([130, -128, z]), np.array([0, -128, z]), np.array([0, 0, z])] - quad_pt = np.array(list((quad_state.x_val, quad_state.y_val, quad_state.z_val))) + pts = [np.array([-0.55265, -31.9786, -19.0225]),np.array([48.59735, -63.3286, -60.07256]),np.array([193.5974, -55.0786, -46.32256]),np.array([369.2474, 35.32137, -62.5725]),np.array([541.3474, 143.6714, -32.07256]),] + + quad_pt = np.array(list((self.state["position"].x_val, self.state["position"].y_val,self.state["position"].z_val,))) - if collision_info.has_collided: + if self.state["collision"]: reward = -100 - else: + else: dist = 10000000 - for i in range(0, len(pts)-1): - dist = min(dist, np.linalg.norm(np.cross((quad_pt - pts[i]), (quad_pt - pts[i+1])))/np.linalg.norm(pts[i]-pts[i+1])) + for i in range(0, len(pts) - 1): + dist = min(dist, np.linalg.norm(np.cross((quad_pt - pts[i]), (quad_pt - pts[i + 1]))) / np.linalg.norm(pts[i] - pts[i + 1])) if dist > thresh_dist: reward = -10 else: - reward = 2*(0.5 - math.exp(-beta*dist)) + np.linalg.norm([quad_vel.x_val, quad_vel.y_val, quad_vel.z_val]) - - return reward + reward_dist = math.exp(-beta * dist) - 0.5 + reward_speed = (np.linalg.norm([self.state["velocity"].x_val, self.state["velocity"].y_val, self.state["velocity"].z_val,])- 0.5) + reward = reward_dist + reward_speed ``` -We consider an episode to terminate if it drifts too much away from the known power line coordinates. +We consider an episode to terminate if it drifts too much away from the known power line coordinates. The reset function here flies the quadrotor to the initial starting point: ``` - if done: - client.moveToZAsync(clearZ, 2).join() - client.moveToPositionAsync(initX, initY, clearZ, 2).join() - client.moveToPositionAsync(initZ, initY, initZ, 2).join() - current_step +=1 +client.moveToZAsync(clearZ, 2).join() +client.moveToPositionAsync(initX, initY, clearZ, 2).join() +client.moveToPositionAsync(initZ, initY, initZ, 2).join() +current_step +=1 ``` +Once the gym-styled environment wrapper is defined as in `drone_env.py`, we then make use of stable-baselines3 to run a DQN training loop. The DQN training can be configured as follows, seen in `dqn_drone.py`. + +``` +model = DQN( + "CnnPolicy", + env, + learning_rate=0.00025, + verbose=1, + batch_size=32, + train_freq=4, + target_update_interval=10000, + learning_starts=10000, + buffer_size=500000, + max_grad_norm=10, + exploration_fraction=0.1, + exploration_final_eps=0.01, + device="cuda", + tensorboard_log="./tb_logs/", +) +``` + +A training environment and an evaluation envrionment (see `EvalCallback` in `dqn_drone.py`) can be defined. The evaluation environoment can be different from training, with different termination conditions/scene configuration. A tensorboard log directory is also defined as part of the DQN parameters. Finally, `model.learn()` starts the DQN training loop. Similarly, implementations of PPO, A3C etc. can be used from stable-baselines3. + Here is the video of first few episodes during the training. [![Reinforcement Learning - Quadrotor](images/dqn_quadcopter.png)](https://youtu.be/uKm15Y3M1Nk) ## Related - Please also see [The Autonomous Driving Cookbook](https://aka.ms/AutonomousDrivingCookbook) by Microsoft Deep Learning and Robotics Garage Chapter. 
+ +Please also see [The Autonomous Driving Cookbook](https://aka.ms/AutonomousDrivingCookbook) by Microsoft Deep Learning and Robotics Garage Chapter. From 534b63c41e998ff047d32e556288d6483ee2b29c Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Wed, 16 Dec 2020 13:25:38 -0800 Subject: [PATCH 10/14] Doc update --- docs/reinforcement_learning.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/reinforcement_learning.md b/docs/reinforcement_learning.md index 782277a2d8..27733d0d8d 100644 --- a/docs/reinforcement_learning.md +++ b/docs/reinforcement_learning.md @@ -1,8 +1,6 @@ # Reinforcement Learning in AirSim -We below describe how we can implement DQN in AirSim using an OpenAI gym wrapper around AirSim API, and using stable baselines. We recommend installing stable-baselines3 in order to run these examples. - -CNTK provides several demo examples of [deep RL](https://github.com/Microsoft/CNTK/tree/master/Examples/ReinforcementLearning). We will modify the DeepQNeuralNetwork.py to work with AirSim. We can utilize most of the classes and methods corresponding to the DQN algorithm. However, there are certain additions we need to make for AirSim. +Below we describe how we can implement DQN in AirSim using an OpenAI gym wrapper around the AirSim API, together with stable-baselines3 implementations of standard RL algorithms. We recommend installing stable-baselines3 in order to run these examples (please see https://github.com/DLR-RM/stable-baselines3). #### Disclaimer @@ -10,11 +8,11 @@ This is still in active development. What we share below is a framework that can #### Gym wrapper -In order to use AirSim as a gym environment, we extend and reimplement the base methods such as `step`, `_get_obs`, `_compute_reward` and `reset`. The sample environments for car and drone can be seen in `PythonClient/reinforcement_learning/*_env.py` +In order to use AirSim as a gym environment, we extend and reimplement the base methods such as `step`, `_get_obs`, `_compute_reward` and `reset` so that they are specific to AirSim and the task of interest. The sample car and drone environments used in these examples can be found in `PythonClient/reinforcement_learning/*_env.py`. ## RL with Car -[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning/dqn_car.py) +[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning) This example works with AirSimNeighborhood environment available in [releases](https://github.com/Microsoft/AirSim/releases). @@ -129,7 +127,7 @@ Note that the simulation needs to be up and running before you execute `dqn_car. ## RL with Quadrotor -[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning/dqn_drone.py) +[Source code](https://github.com/Microsoft/AirSim/tree/master/PythonClient/reinforcement_learning) This example works with AirSimMountainLandscape environment available in [releases](https://github.com/Microsoft/AirSim/releases).
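The gym wrapper pattern described in the documentation above amounts to a small amount of glue code around the AirSim client. The sketch below is only an illustration of that pattern, not the `airgym` implementation added by these patches: the class name `MinimalDroneEnv`, the collision-only reward, and the default constructor arguments are assumptions made for brevity, while the depth-image conversion mirrors the `transform_input` helper shown earlier.

```
import numpy as np
from PIL import Image

import airsim
import gym
from gym import spaces


class MinimalDroneEnv(gym.Env):
    """Bare-bones AirSim drone environment illustrating the wrapper pattern."""

    def __init__(self, ip_address="127.0.0.1", step_length=0.25, image_shape=(84, 84, 1)):
        super().__init__()
        self.step_length = step_length
        self.image_shape = image_shape
        self.observation_space = spaces.Box(0, 255, shape=image_shape, dtype=np.uint8)
        self.action_space = spaces.Discrete(7)  # six axis-aligned moves + hover

        self.drone = airsim.MultirotorClient(ip=ip_address)
        self.drone.confirmConnection()

    def _get_obs(self):
        # Depth image -> 84x84 uint8 array, same conversion as transform_input above
        responses = self.drone.simGetImages(
            [airsim.ImageRequest("0", airsim.ImageType.DepthPerspective, True, False)]
        )
        img1d = np.array(responses[0].image_data_float, dtype=np.float32)
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
        img2d = np.reshape(img1d, (responses[0].height, responses[0].width))
        image = Image.fromarray(img2d).resize((84, 84)).convert("L")
        return np.array(image).reshape(self.image_shape)

    def _do_action(self, action):
        # Map the discrete action to a velocity offset along one axis (or hover)
        offsets = [
            (self.step_length, 0, 0), (-self.step_length, 0, 0),
            (0, self.step_length, 0), (0, -self.step_length, 0),
            (0, 0, self.step_length), (0, 0, -self.step_length),
            (0, 0, 0),
        ]
        dx, dy, dz = offsets[int(action)]
        vel = self.drone.getMultirotorState().kinematics_estimated.linear_velocity
        self.drone.moveByVelocityAsync(
            vel.x_val + dx, vel.y_val + dy, vel.z_val + dz, 5
        ).join()

    def _compute_reward(self):
        # Simplified placeholder reward: penalize collisions, otherwise keep flying
        if self.drone.simGetCollisionInfo().has_collided:
            return -100.0, True
        return 1.0, False

    def step(self, action):
        self._do_action(action)
        obs = self._get_obs()
        reward, done = self._compute_reward()
        return obs, reward, done, {}

    def reset(self):
        self.drone.reset()
        self.drone.enableApiControl(True)
        self.drone.armDisarm(True)
        self.drone.takeoffAsync().join()
        return self._get_obs()
```

In the actual `airgym` environments the reward additionally tracks distance to a set of waypoints (as in the `_compute_reward` shown in the docs), and the environments are instantiated through `gym.make` from `dqn_car.py` and `dqn_drone.py`.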
From 36261b81b6d9913fc58066728d80546d0de78ab9 Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Wed, 16 Dec 2020 16:41:05 -0800 Subject: [PATCH 11/14] Make RL loop faster for drone, cleanup train script --- .../airgym/envs/drone_env.py | 6 +---- .../reinforcement_learning/dqn_car.py | 20 ++++++++++------- .../reinforcement_learning/dqn_drone.py | 22 +++++++++++-------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 938a4a8ba5..9532a0b064 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -37,12 +37,10 @@ def _setup_flight(self): self.drone.reset() self.drone.enableApiControl(True) self.drone.armDisarm(True) - self.drone.takeoffAsync().join() # Set home position and velocity - self.drone.moveToPositionAsync(-0.55265, -31.9786, -19.0225, 5).join() + self.drone.moveToPositionAsync(-0.55265, -31.9786, -19.0225, 10).join() self.drone.moveByVelocityAsync(1, -0.67, -0.8, 5).join() - time.sleep(0.5) def transform_obs(self, responses): img1d = np.array(responses[0].image_data_float, dtype=np.float) @@ -79,7 +77,6 @@ def _do_action(self, action): quad_vel.z_val + quad_offset[2], 5, ).join() - time.sleep(0.5) def _compute_reward(self): thresh_dist = 7 @@ -115,7 +112,6 @@ def _compute_reward(self): / np.linalg.norm(pts[i] - pts[i + 1]), ) - # print(dist) if dist > thresh_dist: reward = -10 else: diff --git a/PythonClient/reinforcement_learning/dqn_car.py b/PythonClient/reinforcement_learning/dqn_car.py index 1506510df3..49d96e515f 100644 --- a/PythonClient/reinforcement_learning/dqn_car.py +++ b/PythonClient/reinforcement_learning/dqn_car.py @@ -1,18 +1,15 @@ import setup_path import gym import airgym - -import numpy as np -import matplotlib.pyplot as plt import time -import torch -from stable_baselines3 import DQN, PPO +from stable_baselines3 import DQN from stable_baselines3.common.monitor import Monitor -from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage from stable_baselines3.common.evaluation import evaluate_policy -from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback +from stable_baselines3.common.callbacks import EvalCallback +# Create a DummyVecEnv for main airsim gym env env = DummyVecEnv( [ lambda: Monitor( @@ -25,6 +22,10 @@ ] ) +# Wrap env as VecTransposeImage to allow SB to handle frame observations +env = VecTransposeImage(env) + +# Initialize RL algorithm type and parameters model = DQN( "CnnPolicy", env, @@ -42,7 +43,7 @@ tensorboard_log="./tb_logs/", ) - +# Create an evaluation callback with the same env, called every 10000 iterations callbacks = [] eval_callback = EvalCallback( env, @@ -57,7 +58,10 @@ kwargs = {} kwargs["callback"] = callbacks +# Train for a certain number of timesteps model.learn( total_timesteps=5e5, tb_log_name="dqn_airsim_car_run_" + str(time.time()), **kwargs ) + +# Save policy weights model.save("dqn_airsim_car_policy") diff --git a/PythonClient/reinforcement_learning/dqn_drone.py b/PythonClient/reinforcement_learning/dqn_drone.py index faeea2c411..d17bb8567c 100644 --- a/PythonClient/reinforcement_learning/dqn_drone.py +++ b/PythonClient/reinforcement_learning/dqn_drone.py @@ -1,18 +1,15 @@ import setup_path import gym import airgym - -import numpy as np -import matplotlib.pyplot as plt import time -import torch 
-from stable_baselines3 import DQN, PPO +from stable_baselines3 import DQN from stable_baselines3.common.monitor import Monitor -from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage from stable_baselines3.common.evaluation import evaluate_policy -from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback +from stable_baselines3.common.callbacks import EvalCallback +# Create a DummyVecEnv for main airsim gym env env = DummyVecEnv( [ lambda: Monitor( @@ -26,6 +23,10 @@ ] ) +# Wrap env as VecTransposeImage to allow SB to handle frame observations +env = VecTransposeImage(env) + +# Initialize RL algorithm type and parameters model = DQN( "CnnPolicy", env, @@ -43,7 +44,7 @@ tensorboard_log="./tb_logs/", ) - +# Create an evaluation callback with the same env, called every 10000 iterations callbacks = [] eval_callback = EvalCallback( env, @@ -51,16 +52,19 @@ n_eval_episodes=5, best_model_save_path=".", log_path=".", - eval_freq=10000, + eval_freq=1, ) callbacks.append(eval_callback) kwargs = {} kwargs["callback"] = callbacks +# Train for a certain number of timesteps model.learn( total_timesteps=5e5, tb_log_name="dqn_airsim_drone_run_" + str(time.time()), **kwargs ) + +# Save policy weights model.save("dqn_airsim_drone_policy") From 4172290b4392490cef2abb717b22634179a88c3e Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Thu, 17 Dec 2020 10:36:03 -0800 Subject: [PATCH 12/14] Fix action space in drone env, doc update --- .../airgym/envs/drone_env.py | 2 +- docs/reinforcement_learning.md | 15 +++------------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 9532a0b064..9352ad1cd0 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -23,7 +23,7 @@ def __init__(self, ip_address, step_length, image_shape): } self.drone = airsim.MultirotorClient(ip=ip_address) - self.action_space = spaces.Discrete(6) + self.action_space = spaces.Discrete(7) self._setup_flight() self.image_request = airsim.ImageRequest( diff --git a/docs/reinforcement_learning.md b/docs/reinforcement_learning.md index 27733d0d8d..69d9c24c98 100644 --- a/docs/reinforcement_learning.md +++ b/docs/reinforcement_learning.md @@ -23,7 +23,7 @@ responses = client.simGetImages([ImageRequest(0, AirSimImageType.DepthPerspectiv current_state = transform_input(responses) ``` -We further define the six actions (breaking, straight with throttle, full-left with throttle, full-right with throttle, half-left with throttle, half-right with throttle) that an agent can execute. This is done via the function `interpret_action`: +We further define the six actions (brake, straight with throttle, full-left with throttle, full-right with throttle, half-left with throttle, half-right with throttle) that an agent can execute. This is done via the function `interpret_action`: ``` def interpret_action(action): @@ -132,7 +132,7 @@ Note that the simulation needs to be up and running before you execute `dqn_car. This example works with AirSimMountainLandscape environment available in [releases](https://github.com/Microsoft/AirSim/releases). We can similarly apply RL for various autonomous flight scenarios with quadrotors. Below is an example on how RL could be used to train quadrotors to follow high tension power lines (e.g. 
application for energy infrastructure inspection). -There are six discrete actions here that correspond to different directions in which the quadrotor can move in (six directions + one hovering action). +There are seven discrete actions here that correspond to different directions in which the quadrotor can move in (six directions + one hovering action). ``` def interpret_action(self, action): @@ -179,16 +179,7 @@ def compute_reward(quad_state, quad_vel, collision_info): reward = reward_dist + reward_speed ``` -We consider an episode to terminate if it drifts too much away from the known power line coordinates. - -The reset function here flies the quadrotor to the initial starting point: - -``` -client.moveToZAsync(clearZ, 2).join() -client.moveToPositionAsync(initX, initY, clearZ, 2).join() -client.moveToPositionAsync(initZ, initY, initZ, 2).join() -current_step +=1 -``` +We consider an episode to terminate if it drifts too much away from the known power line coordinates, and then reset the drone to its starting point. Once the gym-styled environment wrapper is defined as in `drone_env.py`, we then make use of stable-baselines3 to run a DQN training loop. The DQN training can be configured as follows, seen in `dqn_drone.py`. From 475ecdfc83d29cca92b440931ab89cdf9db63d2a Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Thu, 17 Dec 2020 10:46:47 -0800 Subject: [PATCH 13/14] Switch camera type for drone env, fix DQN eval frequency --- PythonClient/reinforcement_learning/airgym/envs/drone_env.py | 2 +- PythonClient/reinforcement_learning/dqn_drone.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py index 9352ad1cd0..5fd6a9515f 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/drone_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/drone_env.py @@ -27,7 +27,7 @@ def __init__(self, ip_address, step_length, image_shape): self._setup_flight() self.image_request = airsim.ImageRequest( - "0", airsim.ImageType.DepthPerspective, True, False + 3, airsim.ImageType.DepthPerspective, True, False ) def __del__(self): diff --git a/PythonClient/reinforcement_learning/dqn_drone.py b/PythonClient/reinforcement_learning/dqn_drone.py index d17bb8567c..845b396859 100644 --- a/PythonClient/reinforcement_learning/dqn_drone.py +++ b/PythonClient/reinforcement_learning/dqn_drone.py @@ -52,7 +52,7 @@ n_eval_episodes=5, best_model_save_path=".", log_path=".", - eval_freq=1, + eval_freq=10000, ) callbacks.append(eval_callback) From 804f7fd88908e2de91e0d78a0cb91dc5f371264a Mon Sep 17 00:00:00 2001 From: Sai Vemprala Date: Thu, 17 Dec 2020 12:13:22 -0800 Subject: [PATCH 14/14] Minor tweaks to car env --- PythonClient/reinforcement_learning/airgym/envs/car_env.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PythonClient/reinforcement_learning/airgym/envs/car_env.py b/PythonClient/reinforcement_learning/airgym/envs/car_env.py index 8e42955785..7a0487f070 100644 --- a/PythonClient/reinforcement_learning/airgym/envs/car_env.py +++ b/PythonClient/reinforcement_learning/airgym/envs/car_env.py @@ -47,7 +47,7 @@ def __del__(self): def _do_action(self, action): self.car_controls.brake = 0 - self.car_controls.throttle = 5 + self.car_controls.throttle = 1 if action == 0: self.car_controls.throttle = 0 self.car_controls.brake = 1 @@ -63,7 +63,7 @@ def _do_action(self, action): self.car_controls.steering = -0.25 
self.car.setCarControls(self.car_controls) - time.sleep(0.5) + time.sleep(1) def transform_obs(self, responses): img1d = np.array(responses[0].image_data_float, dtype=np.float)
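Both training scripts end by saving the learned weights with `model.save(...)`. A minimal evaluation sketch is given below for completeness; it is not part of the patch series. The environment id and the constructor keyword arguments are placeholders — reuse whatever `dqn_drone.py` actually passes to `gym.make` — while the stable-baselines3 calls (`DQN.load`, `evaluate_policy`, `model.predict`) are standard API.

```
import setup_path  # adds the AirSim client path, as in the training scripts
import gym
import airgym  # noqa: F401 -- importing airgym makes the AirSim envs available to gym.make

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.evaluation import evaluate_policy

# Placeholder id: substitute the id used by gym.make in dqn_drone.py
ENV_ID = "airgym:airsim-drone-sample-v0"

env = DummyVecEnv(
    [
        lambda: Monitor(
            gym.make(
                ENV_ID,
                ip_address="127.0.0.1",
                step_length=0.25,
                image_shape=(84, 84, 1),
            )
        )
    ]
)
env = VecTransposeImage(env)

# Load the weights written by model.save("dqn_airsim_drone_policy")
model = DQN.load("dqn_airsim_drone_policy", env=env)

# Aggregate statistics over a few episodes...
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# ...or roll the policy out step by step
obs = env.reset()
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
```

As with training, the AirSim simulation needs to be up and running before an evaluation script like this is started.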