From b000dbfb75f03c167ac17beb45d591b6b5c42ed0 Mon Sep 17 00:00:00 2001 From: elliottower Date: Wed, 1 Feb 2023 04:00:03 -0500 Subject: [PATCH] Cleaned DQN tianshou code, updated command line arguments to be consistent, updated README --- README.md | 36 +++++++++++-------- gobblet/examples/example_DQN_tianshou.py | 45 ++++++++++++++++++------ gobblet/examples/example_basic.py | 4 ++- gobblet/examples/example_record_game.py | 4 +-- gobblet/examples/example_user_input.py | 7 ++-- gobblet/game/manual_policy.py | 2 ++ pyproject.toml | 2 +- 7 files changed, 66 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index afd17e0..0d452cb 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ Interactive Multi-Agent Reinforcement Learning Environment for the [Gobblet](https://themindcafe.com.sg/wp-content/uploads/2018/07/Gobblet-Gobblers.pdf) board game using [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo) and [Pygame](https://github.com/pygame/pygame). +Allows for users to play in the same environment, and even play against RL agents trained with [Tianshou](https://github.com/thu-ml/tianshou). +

Light         @@ -39,6 +41,16 @@ from gobblet import gobblet_v1 env = gobblet_v1.env() ``` +### Play against a DQN agent trained with Tianshou + +In the terminal, run the following: +``` +python gobblet/examples/example_DQN_tianshou.py --epoch 50 --player 1 --cpu-players 2 +``` + +This will train a [DQN](https://tianshou.readthedocs.io/en/master/tutorials/dqn.html) model from Tianshou for 50 epochs, and launch an interactive game against the pre-trained agent. + +Use the argument ``--cpu-players`` to determine the number of CPU agents (1 or 2) and ``--player`` to choose which agent goes first (human first: 0, CPU first: 1) ### Play an interactive game @@ -47,30 +59,23 @@ In the terminal, run the following: python gobblet/examples/example_user_input.py" ``` To select a piece size, press a number key `1`, `2`, or `3`, or press `space` to cycle through pieces. Placing a piece is done by clicking on a square on the board. A preview will appear showing legal moves with the selected piece size. Clicking on an already placed piece will pick it up and prompt you to place it in a new location (re-placing in the same location is an illegal move). 
-### Screen recording of a game + +### Create screen recording of a game In the terminal, run the following: ``` python gobblet/examples/example_record_game.py" ``` -This will save a screen recording of a game to `gobblet/examples/game.gif` +This will save a screen recording of a game to `gobblet/examples/game.gif` -### Display a game between two basic CPU agents -In the terminal, run the following: -``` -python gobblet/examples/example_basic.py" -``` -This will launch a game with two agents choosing random actions (other agent types will be added in the future) - -### Train a DQL agent with Tianshou +### Watch a game between two basic CPU agents In the terminal, run the following: ``` -python gobblet/example_tianshou.py +python gobblet/examples/example_basic.py - -This will train a [DQN](https://tianshou.readthedocs.io/en/master/tutorials/dqn.html) model from Tianshou for 50 epochs, and then render the trained agent playing against a random agent in an example match. +This will launch a game with two basic agents choosing random actions. This file can be used as a starting point for prototyping future methods. ### Command Line Arguments @@ -78,17 +83,20 @@ This will train a [DQN](https://tianshou.readthedocs.io/en/master/tutorials/dqn. #### Game Modes -`example_user_input.py` and `example_record_game.py` take the following arguments: +All scripts besides `example_basic.py` (no support for interactive play) take the following arguments: The default game mode is human vs CPU, with the human playing as red and CPU as yellow. ``--player 1`` sets the human player as yellow, with the CPU moving first as red. +``--cpu-players 1`` will launch a game with one CPU agent and one human agent. (default) + ``--cpu-players 0`` will launch a game with no CPU agents, taking interactive input for both agents. ``--cpu-player 2`` will launch a game with two CPU agents, and takes no interactive input. 
+ #### Display Modes `example_basic.py` takes the following arguments to change display mode: diff --git a/gobblet/examples/example_DQN_tianshou.py b/gobblet/examples/example_DQN_tianshou.py index 796d5a5..7fb2370 100644 --- a/gobblet/examples/example_DQN_tianshou.py +++ b/gobblet/examples/example_DQN_tianshou.py @@ -1,6 +1,5 @@ -# Modified from tutorial code -"""Modified code from Tianshou MARL training example: - +# adapted from https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/Tianshou/3_cli_and_logging.py +""" This is a full example of using Tianshou with MARL to train agents, complete with argument parsing (CLI) and logging. Author: Will (https://github.com/WillDudley) @@ -31,6 +30,7 @@ from gobblet import gobblet_v1 from gobblet.game.collector_manual_policy import ManualPolicyCollector +import time def get_parser() -> argparse.ArgumentParser: @@ -57,9 +57,11 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument("--test-num", type=int, default=10) parser.add_argument("--logdir", type=str, default="log") parser.add_argument("--render", type=float, default=0.1) - parser.add_argument("--render_mode", type=str, default="human", help="options: human, human_full") + parser.add_argument("--render_mode", type=str, default="human", choices=["human","rgb_array", "text", "text_full"], help="Choose the rendering mode for the game.") parser.add_argument("--debug", action="store_true", help="enable to print extra debugging info") parser.add_argument("--self_play", action="store_true", help="enable training via self-play (as opposed to fixed opponent)") + parser.add_argument("--cpu-players", type=int, default=2, choices=[1, 2], help="Number of CPU players (options: 1, 2)") + parser.add_argument("--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1") parser.add_argument( "--win-rate", type=float, @@ -267,7 +269,15 @@ def watch( else: policy.policies[agents[:]].set_eps(args.eps_test) 
collector = Collector(policy, env, exploration_noise=True) - result = collector.collect(n_episode=1, render=0) + + # First step (while loop stopping conditions are not defined until we run the first step) + result = collector.collect(n_step=1, render=args.render) + time.sleep(0.25) + + while not (collector.data.terminated or collector.data.truncated): + result = collector.collect(n_step=1, render=args.render) + time.sleep(0.25) # Slow down rendering so the actions can be seen sequentially (otherwise moves happen too fast) + rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews[:, args.agent_id - 1].mean()}, length: {lens.mean()}") @@ -285,7 +295,7 @@ def watch_selfplay(args, agent): print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()}") -# Allows the user to input moves and play vs the learned agent +# ======== allows the user to input moves and play vs a pre-trained agent ====== def play( args: argparse.Namespace = get_args(), agent_learn: Optional[BasePolicy] = None, @@ -304,12 +314,22 @@ def play( pettingzoo_env = env.workers[0].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env manual_policy = gobblet_v1.ManualPolicy(pettingzoo_env) # Gobblet keyboard input requires access to raw_env (uses functions from board) - # Get the first move from the CPU player - result = collector.collect(n_step=1, render=args.render) + # Get the first move from the CPU (human goes second) + if args.player == 1: + result = collector.collect(n_step=1, render=args.render) + + # Get the first move from the player + else: + observation = {"observation": collector.data.obs.obs, + "action_mask": collector.data.obs.mask} # PettingZoo expects a dict with this format + action = manual_policy(observation, pettingzoo_env.agents[0]) + + result = collector.collect_result(action=action.reshape(1), render=args.render) while not (collector.data.terminated or collector.data.truncated): agent_id = collector.data.obs.agent_id - if agent_id == 
pettingzoo_env.agents[1]: + # If it is the player's turn and there are fewer than 2 CPU players (at least one human player) + if agent_id == pettingzoo_env.agents[args.player]: # action_mask = collector.data.obs.mask[0] # action = np.random.choice(np.arange(len(action_mask)), p=action_mask / np.sum(action_mask)) observation = {"observation": collector.data.obs.obs, @@ -327,5 +347,8 @@ def play( # train the agent and watch its performance in a match! args = get_args() result, agent = train_agent(args) - # watch(args, agent) - play(args, agent) + if args.cpu_players == 2: + + watch(args, agent) + else: + play(args, agent) diff --git a/gobblet/examples/example_basic.py b/gobblet/examples/example_basic.py index 607ff82..2215968 100644 --- a/gobblet/examples/example_basic.py +++ b/gobblet/examples/example_basic.py @@ -7,8 +7,10 @@ def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( - "--render-mode", type=str, default="human", help="options: human, rgb_array, text, text_full" + "--render_mode", type=str, default="human", choices=["human", "rgb_array", "text", "text_full"], + help="Choose the rendering mode for the game." 
) + parser.add_argument( "--seed", type=int, default=None, help="random seed for board and policy" ) diff --git a/gobblet/examples/example_record_game.py b/gobblet/examples/example_record_game.py index ef85409..1aff207 100644 --- a/gobblet/examples/example_record_game.py +++ b/gobblet/examples/example_record_game.py @@ -9,10 +9,10 @@ def get_parser() -> argparse.ArgumentParser: "--seed", type=int, default=None, help="Set random seed manually (will only affect CPU agents)" ) parser.add_argument( - "--cpu-players", type=int, default=1, help="Number of CPU players (options: 1, 2)" + "--cpu-players", type=int, default=1, choices=[0, 1, 2], help="Number of CPU players (options: 0, 1, 2)" ) parser.add_argument( - "--player", type=int, default=0, help="Choose which player to play as: red = 0, yellow = 1" + "--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1" ) parser.add_argument( "--screen-width", type=int, default=640, help="Width of pygame screen in pixels" diff --git a/gobblet/examples/example_user_input.py b/gobblet/examples/example_user_input.py index e384afd..0ba7f7f 100644 --- a/gobblet/examples/example_user_input.py +++ b/gobblet/examples/example_user_input.py @@ -9,10 +9,10 @@ def get_parser() -> argparse.ArgumentParser: "--seed", type=int, default=None, help="Set random seed manually (will only affect CPU agents)" ) parser.add_argument( - "--cpu-players", type=int, default=1, help="Number of CPU players (options: 1, 2)" + "--cpu-players", type=int, default=1, choices=[0, 1, 2], help="Number of CPU players (options: 0, 1, 2)" ) parser.add_argument( - "--player", type=int, default=0, help="Choose which player to play as: red = 0, yellow = 1" + "--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1" ) parser.add_argument( "--screen-width", type=int, default=640, help="Width of pygame screen in pixels" @@ -38,9 +38,6 @@ def get_args() -> argparse.Namespace: env 
= gobblet_v1.env(render_mode="human", args=args) env.reset() - env.render() # need to render the environment before pygame can take user input - - manual_policy = gobblet_v1.ManualPolicy(env) for agent in env.agent_iter(): diff --git a/gobblet/game/manual_policy.py b/gobblet/game/manual_policy.py index 9c329ac..d81ed05 100644 --- a/gobblet/game/manual_policy.py +++ b/gobblet/game/manual_policy.py @@ -13,6 +13,8 @@ def __init__(self, env, agent_id: int = 0, recorder: GIFRecorder = None): self.agent = self.env.agents[self.agent_id] self.recorder = recorder + env.render() # need to render the environment before pygame can take user input + def __call__(self, observation, agent): recorder = self.recorder env = self.env diff --git a/pyproject.toml b/pyproject.toml index 897d439..f395831 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "gobblet-rl" -version = "1.2.1" +version = "1.2.2" authors = [ { name="Elliot Tower", email="elliot@elliottower.com" }, ]