Commit
Merge pull request #249 from lanctot:ubuntu_2004
PiperOrigin-RevId: 318510165
Change-Id: Iff82542b05242b1aec2f187cf73019be517b6bdf
Authored and committed by open_spiel@google.com on Jun 26, 2020
2 parents 404cf00 + c3db370 commit b65db20
Showing 23 changed files with 297 additions and 78 deletions.
62 changes: 32 additions & 30 deletions open_spiel/python/algorithms/deep_cfr.py
@@ -18,7 +18,7 @@
The algorithm defines `advantage` and `strategy` networks that compute
advantages used to do regret matching across information sets and to approximate
the strategy profiles of the game. To train these networks a fixed ring buffer
(other data structures may be used) memory is used to accumulate samples to
train the networks.
"""
@@ -30,12 +30,14 @@
import collections
import random
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import policy
from open_spiel.python import simple_nets
import pyspiel

# Temporarily Disable TF2 behavior until we update the code.
tf.disable_v2_behavior()

AdvantageMemory = collections.namedtuple(
"AdvantageMemory", "info_state iteration advantage action")
@@ -48,10 +50,8 @@
class FixedSizeRingBuffer(object):
"""ReplayBuffer of fixed size with a FIFO replacement policy.
Stored transitions can be sampled uniformly. The underlying datastructure is a
ring buffer, allowing O(1) adding and sampling.
"""

def __init__(self, replay_buffer_capacity):
@@ -63,7 +63,6 @@ def add(self, element):
"""Adds `element` to the buffer.
If the buffer is full, the oldest element will be replaced.
Args:
element: data to be added to the buffer.
"""
@@ -79,10 +78,8 @@ def sample(self, num_samples):
Args:
num_samples: `int`, number of samples to draw.
Returns:
An iterable over `num_samples` random elements of the buffer.
Raises:
ValueError: If there are less than `num_samples` elements in the buffer
"""
@@ -121,24 +118,25 @@ def __init__(self,
game,
policy_network_layers=(256, 256),
advantage_network_layers=(128, 128),
num_iterations=100,
num_traversals=20,
learning_rate=1e-4,
num_iterations: int = 100,
num_traversals: int = 20,
learning_rate: float = 1e-4,
batch_size_advantage=None,
batch_size_strategy=None,
memory_capacity=int(1e6),
policy_network_train_steps=1,
advantage_network_train_steps=1):
memory_capacity: int = int(1e6),
policy_network_train_steps: int = 1,
advantage_network_train_steps: int = 1,
reinitialize_advantage_networks: bool = True):
"""Initialize the Deep CFR algorithm.
Args:
session: (tf.Session) TensorFlow session.
game: Open Spiel game.
policy_network_layers: (list[int]) Layer sizes of strategy net MLP.
advantage_network_layers: (list[int]) Layer sizes of advantage net MLP.
num_iterations: (int) Number of iterations.
num_traversals: (int) Number of traversals per iteration.
learning_rate: (float) Learning rate.
num_iterations: Number of iterations.
num_traversals: Number of traversals per iteration.
learning_rate: Learning rate.
batch_size_advantage: (int or None) Batch size to sample from advantage
memories.
batch_size_strategy: (int or None) Batch size to sample from strategy
@@ -148,6 +146,8 @@ def __init__(self,
iteration).
advantage_network_train_steps: Number of advantage network training steps
(per iteration).
reinitialize_advantage_networks: Whether to re-initialize the
advantage network before training on each iteration.
"""
all_players = list(range(game.num_players()))
super(DeepCFRSolver, self).__init__(game, all_players)
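A hedged usage sketch of the updated constructor, exercising the new `reinitialize_advantage_networks` flag; the game, layer sizes, and iteration counts are illustrative, and the call pattern is assumed from the argument list documented above:

```python
import tensorflow.compat.v1 as tf
import pyspiel
from open_spiel.python.algorithms import deep_cfr

tf.disable_v2_behavior()

game = pyspiel.load_game("kuhn_poker")
with tf.Session() as session:
  solver = deep_cfr.DeepCFRSolver(
      session,
      game,
      policy_network_layers=(16,),
      advantage_network_layers=(16,),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=int(1e6),
      reinitialize_advantage_networks=True)
  session.run(tf.global_variables_initializer())
  solver.solve()  # runs traversals and trains the advantage and policy networks
```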
@@ -166,6 +166,7 @@ def __init__(self,
self._embedding_size = len(self._root_node.information_state_tensor(0))
self._num_iterations = num_iterations
self._num_traversals = num_traversals
self._reinitialize_advantage_networks = reinitialize_advantage_networks
self._num_actions = game.num_distinct_actions()
self._iteration = 1

@@ -194,8 +195,9 @@ def __init__(self,

# Define strategy network, loss & memory.
self._strategy_memories = FixedSizeRingBuffer(memory_capacity)
self._policy_network = snt.nets.MLP(
list(policy_network_layers) + [self._num_actions])
self._policy_network = simple_nets.MLP(self._embedding_size,
list(policy_network_layers),
self._num_actions)
action_logits = self._policy_network(self._info_state_ph)
# Illegal actions are handled in the traversal code where expected payoff
# and sampled regret is computed from the advantage networks.
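The recurring construction change in this diff swaps Sonnet's MLP for OpenSpiel's `simple_nets.MLP`, which takes the input size explicitly and the output size as a separate argument instead of appending it to the hidden-layer list. A short before/after sketch with illustrative sizes:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

embedding_size = 11  # e.g. len(root_node.information_state_tensor(0))
num_actions = 3      # e.g. game.num_distinct_actions()

# Old (Sonnet): hidden sizes and output size combined in one list.
#   policy_network = snt.nets.MLP(list(policy_network_layers) + [num_actions])
# New (simple_nets): explicit input size, hidden sizes, then output size.
policy_network = simple_nets.MLP(embedding_size, [256, 256], num_actions)
```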
@@ -212,8 +214,8 @@ def __init__(self,
FixedSizeRingBuffer(memory_capacity) for _ in range(self._num_players)
]
self._advantage_networks = [
snt.nets.MLP(list(advantage_network_layers) + [self._num_actions])
for _ in range(self._num_players)
simple_nets.MLP(self._embedding_size, list(advantage_network_layers),
self._num_actions) for _ in range(self._num_players)
]
self._advantage_outputs = [
self._advantage_networks[i](self._info_state_ph)
@@ -251,8 +253,11 @@ def reinitialize_advantage_networks(self):
self.reinitialize_advantage_network(p)

def reinitialize_advantage_network(self, player):
for key in self._advantage_networks[player].initializers:
self._advantage_networks[player].initializers[key]()
self._session.run(
tf.group(*[
var.initializer
for var in self._advantage_networks[player].variables
]))

def solve(self):
"""Solution logic for Deep CFR."""
@@ -261,8 +266,9 @@ def solve(self):
for p in range(self._num_players):
for _ in range(self._num_traversals):
self._traverse_game_tree(self._root_node, p)
self.reinitialize_advantage_network(p)
# Re-initialize advantage network for player and train from scratch.
if self._reinitialize_advantage_networks:
# Re-initialize advantage network for player and train from scratch.
self.reinitialize_advantage_network(p)
advantage_losses[p].append(self._learn_advantage_network(p))
self._iteration += 1
# Train policy network.
Expand All @@ -274,11 +280,9 @@ def _traverse_game_tree(self, state, player):
Over a traversal the advantage and strategy memories are populated with
computed advantage values and matched regrets respectively.
Args:
state: Current OpenSpiel game state.
player: (int) Player index for this traversal.
Returns:
Recursively returns expected payoffs for each action.
"""
@@ -327,7 +331,6 @@ def _sample_action_from_advantage(self, state, player):
Args:
state: Current OpenSpiel game state.
player: (int) Player index over which to compute regrets.
Returns:
1. (list) Advantage values for info state actions indexed by action.
2. (list) Matched regrets, prob for actions indexed by action.
@@ -366,7 +369,6 @@ def _learn_advantage_network(self, player):
Args:
player: (int) player index.
Returns:
The average loss over the advantage network.
"""
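With Sonnet removed, `reinitialize_advantage_network` above now resets a network by running each of its variables' initializer ops, grouped into one TF op. A small isolated sketch of that pattern, assuming a `simple_nets.MLP` built as in the diff:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

network = simple_nets.MLP(4, [16], 2)  # input size 4, one hidden layer, 2 outputs
reinit_op = tf.group(*[var.initializer for var in network.variables])

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  # ... train `network` for a while ...
  session.run(reinit_op)  # every variable gets a fresh initial value
```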
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/deep_cfr_test.py
@@ -26,6 +26,9 @@
from open_spiel.python.algorithms import exploitability
import pyspiel

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()


class DeepCFRTest(parameterized.TestCase):

32 changes: 21 additions & 11 deletions open_spiel/python/algorithms/dqn.py
@@ -21,10 +21,13 @@
import collections
import random
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_agent
from open_spiel.python import simple_nets

# Temporarily disable TF2 behavior until code is updated.
tf.disable_v2_behavior()

Transition = collections.namedtuple(
"Transition",
@@ -122,7 +125,7 @@ def __init__(self,
self._num_actions = num_actions
if isinstance(hidden_layers_sizes, int):
hidden_layers_sizes = [hidden_layers_sizes]
self._layer_sizes = hidden_layers_sizes + [num_actions]
self._layer_sizes = hidden_layers_sizes
self._batch_size = batch_size
self._update_target_network_every = update_target_network_every
self._learn_every = learn_every
@@ -166,9 +169,12 @@ def __init__(self,
dtype=tf.float32,
name="legal_actions_mask_ph")

self._q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
self._q_network = simple_nets.MLP(state_representation_size,
self._layer_sizes, num_actions)
self._q_values = self._q_network(self._info_state_ph)
self._target_q_network = snt.nets.MLP(output_sizes=self._layer_sizes)

self._target_q_network = simple_nets.MLP(state_representation_size,
self._layer_sizes, num_actions)
self._target_q_values = self._target_q_network(self._next_info_state_ph)

# Stop gradient to prevent updates to the target network while learning
@@ -291,17 +297,21 @@ def _create_target_network_update_op(self, q_network, target_q_network):
"""Create TF ops copying the params of the Q-network to the target network.
Args:
q_network: `snt.AbstractModule`. Values are copied from this network.
target_q_network: `snt.AbstractModule`. Values are copied to this network.
q_network: A q-network object that provides the `variables` property
representing the TF variable list.
target_q_network: A target q-net object that provides the `variables`
property representing the TF variable list.
Returns:
A `tf.Operation` that updates the variables of the target.
"""
variables = q_network.get_variables()
target_variables = target_q_network.get_variables()
self._variables = q_network.variables[:]
self._target_variables = target_q_network.variables[:]
assert self._variables
assert len(self._variables) == len(self._target_variables)
return tf.group([
tf.assign(target_v, v)
for (target_v, v) in zip(target_variables, variables)
for (target_v, v) in zip(self._target_variables, self._variables)
])

def _epsilon_greedy(self, info_state, legal_actions, epsilon):
@@ -403,9 +413,9 @@ def step_counter(self):

def _initialize(self):
initialization_weights = tf.group(
*[var.initializer for var in self._q_network.variables])
*[var.initializer for var in self._variables])
initialization_target_weights = tf.group(
*[var.initializer for var in self._target_q_network.variables])
*[var.initializer for var in self._target_variables])
initialization_opt = tf.group(
*[var.initializer for var in self._optimizer.variables()])

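The DQN changes above replace Sonnet's `get_variables()` with the module's `variables` property and keep the target-network sync as a group of `tf.assign` ops. A minimal sketch of that copy pattern outside the agent, again assuming `simple_nets.MLP` as in the diff:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

q_network = simple_nets.MLP(4, [16], 2)
target_q_network = simple_nets.MLP(4, [16], 2)

# Copy each Q-network variable into the corresponding target variable.
assert len(q_network.variables) == len(target_q_network.variables)
update_target_op = tf.group(*[
    tf.assign(target_v, v)
    for (target_v, v) in zip(target_q_network.variables, q_network.variables)
])

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  session.run(update_target_op)  # target network now mirrors the Q-network
```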
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/dqn_test.py
@@ -24,6 +24,9 @@
from open_spiel.python.algorithms import dqn
import pyspiel

# Temporarily disable TF2 behavior until code is updated.
tf.disable_v2_behavior()


class DQNTest(tf.test.TestCase):

10 changes: 7 additions & 3 deletions open_spiel/python/algorithms/eva.py
@@ -30,12 +30,15 @@
import collections
import copy
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_agent
from open_spiel.python import simple_nets
from open_spiel.python.algorithms import dqn

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()

MEM_KEY_NAME = "embedding"

ValueBufferElement = collections.namedtuple("ValueElement", "embedding value")
@@ -154,8 +157,9 @@ def __init__(self,
shape=[None, self._info_state_size],
dtype=tf.float32,
name="info_state_ph")
self._embedding_network = snt.nets.MLP(
list(embedding_network_layers) + [embedding_size])
self._embedding_network = simple_nets.MLP(self._info_state_size,
list(embedding_network_layers),
embedding_size)
self._embedding = self._embedding_network(self._info_state_ph)

# The DQN agent requires this be an integer.
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/eva_test.py
@@ -24,6 +24,9 @@
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import eva

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()


class EVATest(parameterized.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/exploitability_descent.py
@@ -49,6 +49,9 @@
from open_spiel.python.algorithms import action_value_vs_best_response
from open_spiel.python.algorithms import masked_softmax

# Temporary disabling of v2 behavior until code is updated.
tf.disable_v2_behavior()

_NUM_PLAYERS = 2


3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/exploitability_descent_test.py
@@ -24,6 +24,9 @@
from open_spiel.python.algorithms import exploitability_descent
import pyspiel

# Temporary disabling of v2 behavior until code is updated.
tf.disable_v2_behavior()


class ExploitabilityDescentTest(tf.test.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/losses/rl_losses.py
@@ -31,6 +31,9 @@

import tensorflow.compat.v1 as tf

# Temporarily disable v2 behavior until code is updated.
tf.disable_v2_behavior()


def _assert_rank_and_shape_compatibility(tensors, rank):
if not tensors:
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/losses/rl_losses_test.py
@@ -24,6 +24,9 @@

from open_spiel.python.algorithms.losses import rl_losses

# Temporarily disable v2 behavior until code is updated.
tf.disable_v2_behavior()


class RLLossesTest(parameterized.TestCase, tf.test.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/masked_softmax.py
@@ -22,6 +22,9 @@
import numpy as np
import tensorflow.compat.v1 as tf

# Temporarily disable TF2 behavior until the code is updated.
tf.disable_v2_behavior()


def tf_masked_softmax(logits, legal_actions_mask):
"""Returns the softmax over the valid actions defined by `legal_actions_mask`.
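The `masked_softmax.py` change is only the TF1-compat guard, but for context, the masked softmax its docstring describes can be illustrated with a rough numpy sketch (1-D case; the library's `tf_masked_softmax` operates on TF tensors and may differ in details such as batching):

```python
import numpy as np


def masked_softmax_sketch(logits, legal_actions_mask):
  """Rough 1-D illustration: softmax restricted to the legal actions."""
  logits = np.asarray(logits, dtype=np.float64)
  mask = np.asarray(legal_actions_mask, dtype=bool)
  masked = np.where(mask, logits, -np.inf)  # illegal actions get zero probability
  shifted = masked - np.max(masked)         # subtract the max for numerical stability
  exps = np.exp(shifted)
  return exps / np.sum(exps)


print(masked_softmax_sketch([1.0, 2.0, 3.0], [1, 0, 1]))  # approx. [0.12, 0.0, 0.88]
```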
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/masked_softmax_test.py
@@ -28,6 +28,9 @@

from open_spiel.python.algorithms import masked_softmax

# Temporarily disable TF2 behavior until the code is updated.
tf.disable_v2_behavior()


exp = math.exp # For shorter lines

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/neurd.py
@@ -34,6 +34,9 @@

from open_spiel.python.algorithms import rcfr

# Temporarily disable TF2 behavior while the code is not updated.
tf.disable_v2_behavior()


def thresholded(logits, regrets, threshold=2.0):
"""Zeros out `regrets` where `logits` are too negative or too large."""
(Diffs for the remaining changed files are not shown.)