Commit
Merge pull request #249 from lanctot:ubuntu_2004
PiperOrigin-RevId: 318510165
Change-Id: Iff82542b05242b1aec2f187cf73019be517b6bdf
Authored and committed by open_spiel@google.com on Jun 26, 2020
2 parents 404cf00 + c3db370 commit b65db20
Showing 23 changed files with 297 additions and 78 deletions.
62 changes: 32 additions & 30 deletions open_spiel/python/algorithms/deep_cfr.py
@@ -18,7 +18,7 @@
The algorithm defines `advantage` and `strategy` networks that compute
advantages used to do regret matching across information sets and to approximate
the strategy profiles of the game. To train these networks a fixed ring buffer
(other data structures may be used) memory is used to accumulate samples to
train the networks.
"""
@@ -30,12 +30,14 @@
import collections
import random
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import policy
from open_spiel.python import simple_nets
import pyspiel

# Temporarily Disable TF2 behavior until we update the code.
tf.disable_v2_behavior()

AdvantageMemory = collections.namedtuple(
"AdvantageMemory", "info_state iteration advantage action")
@@ -48,10 +50,8 @@
class FixedSizeRingBuffer(object):
"""ReplayBuffer of fixed size with a FIFO replacement policy.
Stored transitions can be sampled uniformly. The underlying datastructure is a
ring buffer, allowing O(1) adding and sampling.
"""

def __init__(self, replay_buffer_capacity):
@@ -63,7 +63,6 @@ def add(self, element):
"""Adds `element` to the buffer.
If the buffer is full, the oldest element will be replaced.
Args:
element: data to be added to the buffer.
"""
@@ -79,10 +78,8 @@ def sample(self, num_samples):
Args:
num_samples: `int`, number of samples to draw.
Returns:
An iterable over `num_samples` random elements of the buffer.
Raises:
ValueError: If there are less than `num_samples` elements in the buffer
"""
@@ -121,24 +118,25 @@ def __init__(self,
game,
policy_network_layers=(256, 256),
advantage_network_layers=(128, 128),
num_iterations=100,
num_traversals=20,
learning_rate=1e-4,
num_iterations: int = 100,
num_traversals: int = 20,
learning_rate: float = 1e-4,
batch_size_advantage=None,
batch_size_strategy=None,
memory_capacity=int(1e6),
policy_network_train_steps=1,
advantage_network_train_steps=1):
memory_capacity: int = int(1e6),
policy_network_train_steps: int = 1,
advantage_network_train_steps: int = 1,
reinitialize_advantage_networks: bool = True):
"""Initialize the Deep CFR algorithm.
Args:
session: (tf.Session) TensorFlow session.
game: Open Spiel game.
policy_network_layers: (list[int]) Layer sizes of strategy net MLP.
advantage_network_layers: (list[int]) Layer sizes of advantage net MLP.
num_iterations: (int) Number of iterations.
num_traversals: (int) Number of traversals per iteration.
learning_rate: (float) Learning rate.
num_iterations: Number of iterations.
num_traversals: Number of traversals per iteration.
learning_rate: Learning rate.
batch_size_advantage: (int or None) Batch size to sample from advantage
memories.
batch_size_strategy: (int or None) Batch size to sample from strategy
@@ -148,6 +146,8 @@ def __init__(self,
iteration).
advantage_network_train_steps: Number of advantage network training steps
(per iteration).
reinitialize_advantage_networks: Whether to re-initialize the
advantage network before training on each iteration.
"""
all_players = list(range(game.num_players()))
super(DeepCFRSolver, self).__init__(game, all_players)
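A hedged usage sketch of the updated constructor, exercising the new `reinitialize_advantage_networks` flag; the game, layer sizes, and iteration counts are illustrative, and the call pattern is assumed from the argument list documented above:

```python
import tensorflow.compat.v1 as tf
import pyspiel
from open_spiel.python.algorithms import deep_cfr

tf.disable_v2_behavior()

game = pyspiel.load_game("kuhn_poker")
with tf.Session() as session:
  solver = deep_cfr.DeepCFRSolver(
      session,
      game,
      policy_network_layers=(16,),
      advantage_network_layers=(16,),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=int(1e6),
      reinitialize_advantage_networks=True)
  session.run(tf.global_variables_initializer())
  solver.solve()  # runs traversals and trains the advantage and policy networks
```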
@@ -166,6 +166,7 @@ def __init__(self,
self._embedding_size = len(self._root_node.information_state_tensor(0))
self._num_iterations = num_iterations
self._num_traversals = num_traversals
self._reinitialize_advantage_networks = reinitialize_advantage_networks
self._num_actions = game.num_distinct_actions()
self._iteration = 1

@@ -194,8 +195,9 @@ def __init__(self,

# Define strategy network, loss & memory.
self._strategy_memories = FixedSizeRingBuffer(memory_capacity)
self._policy_network = snt.nets.MLP(
list(policy_network_layers) + [self._num_actions])
self._policy_network = simple_nets.MLP(self._embedding_size,
list(policy_network_layers),
self._num_actions)
action_logits = self._policy_network(self._info_state_ph)
# Illegal actions are handled in the traversal code where expected payoff
# and sampled regret is computed from the advantage networks.
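The recurring construction change in this diff swaps Sonnet's MLP for OpenSpiel's `simple_nets.MLP`, which takes the input size explicitly and the output size as a separate argument instead of appending it to the hidden-layer list. A short before/after sketch with illustrative sizes:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

embedding_size = 11  # e.g. len(root_node.information_state_tensor(0))
num_actions = 3      # e.g. game.num_distinct_actions()

# Old (Sonnet): hidden sizes and output size combined in one list.
#   policy_network = snt.nets.MLP(list(policy_network_layers) + [num_actions])
# New (simple_nets): explicit input size, hidden sizes, then output size.
policy_network = simple_nets.MLP(embedding_size, [256, 256], num_actions)
```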
@@ -212,8 +214,8 @@ def __init__(self,
FixedSizeRingBuffer(memory_capacity) for _ in range(self._num_players)
]
self._advantage_networks = [
snt.nets.MLP(list(advantage_network_layers) + [self._num_actions])
for _ in range(self._num_players)
simple_nets.MLP(self._embedding_size, list(advantage_network_layers),
self._num_actions) for _ in range(self._num_players)
]
self._advantage_outputs = [
self._advantage_networks[i](self._info_state_ph)
@@ -251,8 +253,11 @@ def reinitialize_advantage_networks(self):
self.reinitialize_advantage_network(p)

def reinitialize_advantage_network(self, player):
for key in self._advantage_networks[player].initializers:
self._advantage_networks[player].initializers[key]()
self._session.run(
tf.group(*[
var.initializer
for var in self._advantage_networks[player].variables
]))

def solve(self):
"""Solution logic for Deep CFR."""
@@ -261,8 +266,9 @@ def solve(self):
for p in range(self._num_players):
for _ in range(self._num_traversals):
self._traverse_game_tree(self._root_node, p)
self.reinitialize_advantage_network(p)
# Re-initialize advantage network for player and train from scratch.
if self._reinitialize_advantage_networks:
# Re-initialize advantage network for player and train from scratch.
self.reinitialize_advantage_network(p)
advantage_losses[p].append(self._learn_advantage_network(p))
self._iteration += 1
# Train policy network.
Expand All @@ -274,11 +280,9 @@ def _traverse_game_tree(self, state, player):
Over a traversal the advantage and strategy memories are populated with
computed advantage values and matched regrets respectively.
Args:
state: Current OpenSpiel game state.
player: (int) Player index for this traversal.
Returns:
Recursively returns expected payoffs for each action.
"""
@@ -327,7 +331,6 @@ def _sample_action_from_advantage(self, state, player):
Args:
state: Current OpenSpiel game state.
player: (int) Player index over which to compute regrets.
Returns:
1. (list) Advantage values for info state actions indexed by action.
2. (list) Matched regrets, prob for actions indexed by action.
@@ -366,7 +369,6 @@ def _learn_advantage_network(self, player):
Args:
player: (int) player index.
Returns:
The average loss over the advantage network.
"""
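With Sonnet removed, `reinitialize_advantage_network` above now resets a network by running each of its variables' initializer ops, grouped into one TF op. A small isolated sketch of that pattern, assuming a `simple_nets.MLP` built as in the diff:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

network = simple_nets.MLP(4, [16], 2)  # input size 4, one hidden layer, 2 outputs
reinit_op = tf.group(*[var.initializer for var in network.variables])

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  # ... train `network` for a while ...
  session.run(reinit_op)  # every variable gets a fresh initial value
```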
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/deep_cfr_test.py
@@ -26,6 +26,9 @@
from open_spiel.python.algorithms import exploitability
import pyspiel

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()


class DeepCFRTest(parameterized.TestCase):

32 changes: 21 additions & 11 deletions open_spiel/python/algorithms/dqn.py
@@ -21,10 +21,13 @@
import collections
import random
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_agent
from open_spiel.python import simple_nets

# Temporarily disable TF2 behavior until code is updated.
tf.disable_v2_behavior()

Transition = collections.namedtuple(
"Transition",
@@ -122,7 +125,7 @@ def __init__(self,
self._num_actions = num_actions
if isinstance(hidden_layers_sizes, int):
hidden_layers_sizes = [hidden_layers_sizes]
self._layer_sizes = hidden_layers_sizes + [num_actions]
self._layer_sizes = hidden_layers_sizes
self._batch_size = batch_size
self._update_target_network_every = update_target_network_every
self._learn_every = learn_every
@@ -166,9 +169,12 @@ def __init__(self,
dtype=tf.float32,
name="legal_actions_mask_ph")

self._q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
self._q_network = simple_nets.MLP(state_representation_size,
self._layer_sizes, num_actions)
self._q_values = self._q_network(self._info_state_ph)
self._target_q_network = snt.nets.MLP(output_sizes=self._layer_sizes)

self._target_q_network = simple_nets.MLP(state_representation_size,
self._layer_sizes, num_actions)
self._target_q_values = self._target_q_network(self._next_info_state_ph)

# Stop gradient to prevent updates to the target network while learning
@@ -291,17 +297,21 @@ def _create_target_network_update_op(self, q_network, target_q_network):
"""Create TF ops copying the params of the Q-network to the target network.
Args:
q_network: `snt.AbstractModule`. Values are copied from this network.
target_q_network: `snt.AbstractModule`. Values are copied to this network.
q_network: A q-network object that provides the `variables` property
representing the TF variable list.
target_q_network: A target q-net object that provides the `variables`
property representing the TF variable list.
Returns:
A `tf.Operation` that updates the variables of the target.
"""
variables = q_network.get_variables()
target_variables = target_q_network.get_variables()
self._variables = q_network.variables[:]
self._target_variables = target_q_network.variables[:]
assert self._variables
assert len(self._variables) == len(self._target_variables)
return tf.group([
tf.assign(target_v, v)
for (target_v, v) in zip(target_variables, variables)
for (target_v, v) in zip(self._target_variables, self._variables)
])

def _epsilon_greedy(self, info_state, legal_actions, epsilon):
@@ -403,9 +413,9 @@ def step_counter(self):

def _initialize(self):
initialization_weights = tf.group(
*[var.initializer for var in self._q_network.variables])
*[var.initializer for var in self._variables])
initialization_target_weights = tf.group(
*[var.initializer for var in self._target_q_network.variables])
*[var.initializer for var in self._target_variables])
initialization_opt = tf.group(
*[var.initializer for var in self._optimizer.variables()])

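The DQN changes above replace Sonnet's `get_variables()` with the module's `variables` property and keep the target-network sync as a group of `tf.assign` ops. A minimal sketch of that copy pattern outside the agent, again assuming `simple_nets.MLP` as in the diff:

```python
import tensorflow.compat.v1 as tf
from open_spiel.python import simple_nets

tf.disable_v2_behavior()

q_network = simple_nets.MLP(4, [16], 2)
target_q_network = simple_nets.MLP(4, [16], 2)

# Copy each Q-network variable into the corresponding target variable.
assert len(q_network.variables) == len(target_q_network.variables)
update_target_op = tf.group(*[
    tf.assign(target_v, v)
    for (target_v, v) in zip(target_q_network.variables, q_network.variables)
])

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  session.run(update_target_op)  # target network now mirrors the Q-network
```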
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/dqn_test.py
@@ -24,6 +24,9 @@
from open_spiel.python.algorithms import dqn
import pyspiel

# Temporarily disable TF2 behavior until code is updated.
tf.disable_v2_behavior()


class DQNTest(tf.test.TestCase):

10 changes: 7 additions & 3 deletions open_spiel/python/algorithms/eva.py
@@ -30,12 +30,15 @@
import collections
import copy
import numpy as np
import sonnet as snt
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_agent
from open_spiel.python import simple_nets
from open_spiel.python.algorithms import dqn

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()

MEM_KEY_NAME = "embedding"

ValueBufferElement = collections.namedtuple("ValueElement", "embedding value")
@@ -154,8 +157,9 @@ def __init__(self,
shape=[None, self._info_state_size],
dtype=tf.float32,
name="info_state_ph")
self._embedding_network = snt.nets.MLP(
list(embedding_network_layers) + [embedding_size])
self._embedding_network = simple_nets.MLP(self._info_state_size,
list(embedding_network_layers),
embedding_size)
self._embedding = self._embedding_network(self._info_state_ph)

# The DQN agent requires this be an integer.
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/eva_test.py
@@ -24,6 +24,9 @@
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import eva

# Temporarily disable TF2 behavior until we update the code.
tf.disable_v2_behavior()


class EVATest(parameterized.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/exploitability_descent.py
@@ -49,6 +49,9 @@
from open_spiel.python.algorithms import action_value_vs_best_response
from open_spiel.python.algorithms import masked_softmax

# Temporary disabling of v2 behavior until code is updated.
tf.disable_v2_behavior()

_NUM_PLAYERS = 2


3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/exploitability_descent_test.py
@@ -24,6 +24,9 @@
from open_spiel.python.algorithms import exploitability_descent
import pyspiel

# Temporary disabling of v2 behavior until code is updated.
tf.disable_v2_behavior()


class ExploitabilityDescentTest(tf.test.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/losses/rl_losses.py
@@ -31,6 +31,9 @@

import tensorflow.compat.v1 as tf

# Temporarily disable v2 behavior until code is updated.
tf.disable_v2_behavior()


def _assert_rank_and_shape_compatibility(tensors, rank):
if not tensors:
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/losses/rl_losses_test.py
@@ -24,6 +24,9 @@

from open_spiel.python.algorithms.losses import rl_losses

# Temporarily disable v2 behavior until code is updated.
tf.disable_v2_behavior()


class RLLossesTest(parameterized.TestCase, tf.test.TestCase):

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/masked_softmax.py
@@ -22,6 +22,9 @@
import numpy as np
import tensorflow.compat.v1 as tf

# Temporarily disable TF2 behavior until the code is updated.
tf.disable_v2_behavior()


def tf_masked_softmax(logits, legal_actions_mask):
"""Returns the softmax over the valid actions defined by `legal_actions_mask`.
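The `masked_softmax.py` change is only the TF1-compat guard, but for context, the masked softmax its docstring describes can be illustrated with a rough numpy sketch (1-D case; the library's `tf_masked_softmax` operates on TF tensors and may differ in details such as batching):

```python
import numpy as np


def masked_softmax_sketch(logits, legal_actions_mask):
  """Rough 1-D illustration: softmax restricted to the legal actions."""
  logits = np.asarray(logits, dtype=np.float64)
  mask = np.asarray(legal_actions_mask, dtype=bool)
  masked = np.where(mask, logits, -np.inf)  # illegal actions get zero probability
  shifted = masked - np.max(masked)         # subtract the max for numerical stability
  exps = np.exp(shifted)
  return exps / np.sum(exps)


print(masked_softmax_sketch([1.0, 2.0, 3.0], [1, 0, 1]))  # approx. [0.12, 0.0, 0.88]
```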
3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/masked_softmax_test.py
@@ -28,6 +28,9 @@

from open_spiel.python.algorithms import masked_softmax

# Temporarily disable TF2 behavior until the code is updated.
tf.disable_v2_behavior()


exp = math.exp # For shorter lines

3 changes: 3 additions & 0 deletions open_spiel/python/algorithms/neurd.py
@@ -34,6 +34,9 @@

from open_spiel.python.algorithms import rcfr

# Temporarily disable TF2 behavior while the code is not updated.
tf.disable_v2_behavior()


def thresholded(logits, regrets, threshold=2.0):
"""Zeros out `regrets` where `logits` are too negative or too large."""
(Diffs for the remaining changed files are not shown.)