# train_level10_eval_mini_srcgame_add_map_bn.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Comma-separated GPU ids exposed to this process through CUDA_VISIBLE_DEVICES.
# The value "-1" is treated as "CPU only" by the DEVICE selection further down.
USED_DEVICES = "4,5"
import os
# These environment variables are written before TensorFlow is imported below,
# presumably so the device restriction is already in place when CUDA is set up.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = USED_DEVICES

import sys
import threading
import time

import tensorflow as tf
from absl import app
from absl import flags
from pysc2 import maps
from pysc2.lib import stopwatch

import lib.config as C
import param as P
import mini_source_agent_add_map_bn as mini_source_agent
from mini_network_add_map_bn import MiniNetwork


# from pysc2.env import sc2_env
from lib import my_sc2_env as sc2_env
from lib.replay_buffer import Buffer

from strategy.terran_agent import DummyTerran
from strategy_env import SimulatePlatform

import unit.protoss_unit as P
import unit.terran_unit as T

from datetime import datetime
import multiprocessing as mp
import numpy as np
from logging import warning as logging

# Command-line flags for this script.  The help texts below state the defaults
# that are actually configured here.
FLAGS = flags.FLAGS

# --- run mode -------------------------------------------------------------
flags.DEFINE_bool("training", True, "Whether to train agents.")
flags.DEFINE_bool("on_server", True, "Whether is running on server.")
flags.DEFINE_bool("debug_mode", False, "Whether is debuging")
flags.DEFINE_integer("num_for_update", 1000, "Number of episodes for each train.")
flags.DEFINE_string("log_path", "./logs/", "Path for log.")
flags.DEFINE_string("device", USED_DEVICES, "Device for training.")

# --- game / environment (default map: Simple64) ---------------------------
flags.DEFINE_string("map", "Simple64", "Name of a map to use.")
flags.DEFINE_bool("render", False, "Whether to render with pygame.")
flags.DEFINE_integer("screen_resolution", 64, "Resolution for screen feature layers.")
flags.DEFINE_integer("minimap_resolution", 64, "Resolution for minimap feature layers.")

flags.DEFINE_enum("agent_race", "P", sc2_env.races.keys(), "Agent's race.")
flags.DEFINE_enum("bot_race", "T", sc2_env.races.keys(), "Bot's race.")
# "A" is handled specially in run_thread, where it is recorded as level 10.
flags.DEFINE_enum("difficulty", "A", sc2_env.difficulties.keys(), "Bot's strength, default is A (level 10)")

flags.DEFINE_integer("max_agent_steps", 18000, "Total agent steps.")
flags.DEFINE_integer("step_mul", 8, "Game steps per agent step.")

flags.DEFINE_bool("profile", False, "Whether to turn on code profiling.")
flags.DEFINE_bool("trace", False, "Whether to trace the code execution.")
flags.DEFINE_bool("save_replay", False, "Whether to save a replay at the end.")
flags.DEFINE_string("replay_dir", "multi-agent/", "dir of replay to save.")

# --- model restore --------------------------------------------------------
# Previously used checkpoints:
#   20200901-213813_mini
#   20200828-160609_source
flags.DEFINE_string("restore_model_path", "./model/20200915-235932_source/", "path for restore model")
flags.DEFINE_bool("restore_model", True, "Whether to restore old model")
flags.DEFINE_string("restore_from", "source", "mini (for Thought-Game) or source (for Real game)")
flags.DEFINE_string("restore_to", "source", "mini (for Thought-Game) or source (for Real game)")
flags.DEFINE_bool("load_latest", False, "Load latest or bestest model, default is False")

# --- parallelism / distributed setup --------------------------------------
flags.DEFINE_integer("parallel", 10, "How many processes to run in parallel.")
flags.DEFINE_integer("thread_num", 10, "How many thread to run in the process.")
flags.DEFINE_integer("port_num", 6770, "the start port to create distribute tf")
flags.DEFINE_integer("max_iters", 100, "the rl agent max run iters")

flags.DEFINE_string("game_version", None, "game version of SC2")

# --- network options ------------------------------------------------------
flags.DEFINE_bool("freeze_head", False, "Whether freeze_head train agents, default is False")
flags.DEFINE_bool("use_bn", False, "Whether use batch_norm to training, default is False")
flags.DEFINE_bool("use_sep_net", False, "Whether use seperate network for policy and value model, default is False")

flags.DEFINE_integer("ob_space_add", 0, "Add state space from thought game:0,4, default is 0")
flags.DEFINE_integer("act_space_add", 0, "Add action space from thought game:0,5, default is 0")
flags.DEFINE_bool("add_image", False, "Whether add image for input, default is False")
flags.DEFINE_bool("partial_restore", True, "Whether use partial_restore, default is True")
flags.DEFINE_string("weighted_sum_type", "AddWeight", "add weighted sum type: Add, AddWeight, AdaptiveWeight, AttentionWeight, default is AddWeight")
flags.DEFINE_string("initial_type", "original", "weight initial type: original, normal, xavier, he, zero, default is original")

# Parse the command line right away: the module-level configuration that
# follows reads FLAGS values at import time.
FLAGS(sys.argv)

# Resolve the map definition named by --map and copy its positions onto the
# lib.config module.
play_map = C.get_map_class('lib.config.' + FLAGS.map)

C.my_sub_pos, C.enemy_sub_pos = play_map.my_sub_pos, play_map.enemy_sub_pos
C.enemy_main_pos, C.base_camera_pos = play_map.enemy_main_pos, play_map.base_camera_pos

# Runtime sizing.  A full-scale run takes its values from the flags; a local
# or debug run is pinned to a single process / thread / episode / iteration.
PORT_NUM = FLAGS.port_num
if FLAGS.on_server and not FLAGS.debug_mode:
    PARALLEL = FLAGS.parallel
    THREAD_NUM = FLAGS.thread_num
    MAX_AGENT_STEPS = FLAGS.max_agent_steps
    NUM_FOR_UPDATE = FLAGS.num_for_update
    TRAIN_ITERS = FLAGS.max_iters
    if USED_DEVICES == '-1':
        DEVICE = ['/cpu:0']
    else:
        # One logical '/gpu:N' entry per comma-separated id in --device.
        _gpu_count = len(FLAGS.device.split(','))
        DEVICE = ['/gpu:' + str(gpu_idx) for gpu_idx in range(_gpu_count)]
else:
    PARALLEL = 1
    THREAD_NUM = 1
    MAX_AGENT_STEPS = 18000
    NUM_FOR_UPDATE = 1
    TRAIN_ITERS = 1
    DEVICE = ['/gpu:0']


# Make sure the log directory given by --log_path exists.
LOG = FLAGS.log_path
if not os.path.exists(LOG):
    os.makedirs(LOG)

# Address lists for the distributed TF cluster; filled in by _main.
SERVER_DICT = {"worker": [], "ps": []}

# State shared between the rollout threads of one worker process.
UPDATE_EVENT = threading.Event()
ROLLING_EVENT = threading.Event()
Counter = 0
Waiting_Counter = 0
Update_Counter = 0
Result_List = []

''' 
ps -ef |grep liuruoze | grep 'SC2_x64' | awk '{print $2}' | xargs kill -9
kill -9 `ps -ef |grep liuruoze | grep eval_mini_srcgame_add_map_bn | awk '{print $2}' `

'''


def run_thread(agent, game_num, Synchronizer, difficulty):
    """Play episodes with one agent in its own SC2 environment and join the update rounds.

    The function opens an ``sc2_env.SC2Env``, attaches it to ``agent`` and keeps
    calling ``agent.play_right_add`` until ``all_num`` equals
    ``game_num * TRAIN_ITERS``.  When ``FLAGS.training`` is set, every
    ``game_num`` episodes the thread takes part in an update round:

    * threads whose ``agent.index`` is not 0 (and only when ``THREAD_NUM > 1``)
      bump ``Waiting_Counter`` and block on ``ROLLING_EVENT``;
    * the remaining thread waits for ``UPDATE_EVENT`` (when ``THREAD_NUM > 1``),
      then walks through four ``Synchronizer.wait()`` barriers, which line up
      with the four barrier calls per iteration in ``Parameter_Server``, and
      finally releases the waiting threads again.

    Args:
        agent: agent object; the code uses ``set_env``, ``play_right_add``,
            ``result``, ``index``, ``update_result_list``, ``update_policy``,
            ``global_buffer`` and ``reset`` on it.
        game_num: number of episodes this thread plays between two updates.
        Synchronizer: barrier-like object shared with the parameter server.
        difficulty: bot difficulty passed to the environment; the value ``'A'``
            is recorded as 10 in ``C.difficulty``.

    NOTE(review): ``num`` / ``all_num`` are only advanced when
    ``FLAGS.training`` is true, so with training disabled the loop does not
    terminate on its own (unless ``game_num * TRAIN_ITERS`` is 0).
    NOTE(review): the module-level counters and ``Result_List`` are modified
    from several threads without a lock.
    """
    global UPDATE_EVENT, ROLLING_EVENT, Counter, Waiting_Counter, Update_Counter, Result_List

    num = 0  # episodes played since the last update
    all_num = 0  # episodes played in total by this thread
    proc_name = mp.current_process().name

    C._FPS = 22.4 / FLAGS.step_mul  # 2.8 with the default step_mul of 8 (5.6 for step_mul 4)
    step_mul = FLAGS.step_mul  # flag default is 8
    # 'A' is stored as the numeric level 10 in the shared config; any other
    # value is stored as given.
    if difficulty == 'A':
        C.difficulty = 10
    else:
        C.difficulty = difficulty
    with sc2_env.SC2Env(
            map_name=FLAGS.map,
            agent_race=FLAGS.agent_race,
            bot_race=FLAGS.bot_race,
            difficulty=difficulty,
            step_mul=step_mul,
            score_index=-1,
            game_steps_per_episode=MAX_AGENT_STEPS,
            screen_size_px=(FLAGS.screen_resolution, FLAGS.screen_resolution),
            minimap_size_px=(FLAGS.minimap_resolution, FLAGS.minimap_resolution),
            visualize=False,
            game_version=FLAGS.game_version) as env:
        # env = available_actions_printer.AvailableActionsPrinter(env)
        agent.set_env(env)

        while all_num != game_num * TRAIN_ITERS:
            # Play one full episode.
            agent.play_right_add(verbose=FLAGS.debug_mode)

            if FLAGS.training:
                # check if the num of episodes is enough to update
                num += 1
                all_num += 1
                reward = agent.result['reward']
                Counter += 1
                Result_List.append(reward)
                logging("(diff: %d) %d epoch: %s get %d/%d episodes! return: %d!" %
                        (int(C.difficulty), Update_Counter, proc_name, len(Result_List), game_num * THREAD_NUM, reward))

                # time for update
                if num == game_num:
                    num = 0
                    ROLLING_EVENT.clear()
                    # worker stops rolling, wait for update
                    if agent.index != 0 and THREAD_NUM > 1:
                        Waiting_Counter += 1
                        if Waiting_Counter == THREAD_NUM - 1:  # wait for all the workers stop
                            UPDATE_EVENT.set()
                        # Block until the updating thread sets ROLLING_EVENT again.
                        ROLLING_EVENT.wait()

                    # update!
                    else:
                        if THREAD_NUM > 1:
                            # Proceed only once every other thread of this process is waiting.
                            UPDATE_EVENT.wait()

                        # Barrier 1: all processes have finished their rollouts.
                        Synchronizer.wait()  # wait for other processes to update
                        agent.update_result_list(Result_List)
                        Result_List.clear()
                        
                        # Barrier 2: results handed over; the parameter server
                        # summarizes and saves the model after this barrier.
                        Synchronizer.wait()
                        logging("Worker: Wait for model to save!")
                        # TODO count the time , compare cpu and gpu
                        time.sleep(1)
                        
                        # Barrier 3: run the policy update and clear the shared buffer.
                        Synchronizer.wait()
                        agent.update_policy()
                        agent.global_buffer.reset()

                        # Barrier 4: every process has finished its update.
                        Synchronizer.wait()
                        Update_Counter += 1

                        # finish update
                        UPDATE_EVENT.clear()
                        Waiting_Counter = 0
                        # Release the threads blocked on ROLLING_EVENT.
                        ROLLING_EVENT.set()

            if FLAGS.save_replay:
                env.save_replay(FLAGS.replay_dir)

            agent.reset()


def Worker(index, update_game_num, Synchronizer, cluster, model_path, log_path):
    """Run one worker process of the distributed training setup.

    Starts a TF server for task ``index`` of the "worker" job, builds a
    ``MiniNetwork`` on it, creates ``THREAD_NUM`` agents sharing one replay
    ``Buffer``, waits until no uninitialized variables are reported, and then
    runs ``run_thread`` for every agent (``THREAD_NUM - 1`` daemon threads
    plus the calling thread for the last agent).

    Args:
        index: task index of this worker; also picks the device through
            ``DEVICE[index % len(DEVICE)]``.
        update_game_num: number of episodes this process should collect per
            update round; it is split across the ``THREAD_NUM`` threads.
        Synchronizer: barrier-like object handed through to ``run_thread``.
        cluster: cluster specification used for the TF server and the network.
        model_path: passed to ``MiniNetwork`` as ``ppo_save_path``.
        log_path: directory handed to ``tf.summary.FileWriter``.
    """
    config = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False,
    )
    config.gpu_options.allow_growth = True
    worker = tf.train.Server(cluster, job_name="worker", task_index=index, config=config)
    sess = tf.Session(target=worker.target, config=config)
    summary_writer = tf.summary.FileWriter(log_path)
    Net = MiniNetwork(sess=sess, summary_writer=summary_writer, rl_training=FLAGS.training,
                      cluster=cluster, index=index, device=DEVICE[index % len(DEVICE)],
                      ppo_load_path=FLAGS.restore_model_path, ppo_save_path=model_path,
                      ob_space_add=FLAGS.ob_space_add, act_space_add=FLAGS.act_space_add,
                      freeze_head=FLAGS.freeze_head, use_bn=FLAGS.use_bn,
                      use_sep_net=FLAGS.use_sep_net, restore_model=FLAGS.restore_model,
                      restore_from=FLAGS.restore_from, restore_to=FLAGS.restore_to,
                      load_latest=FLAGS.load_latest, add_image=FLAGS.add_image, partial_restore=FLAGS.partial_restore,
                      weighted_sum_type=FLAGS.weighted_sum_type, initial_type=FLAGS.initial_type)

    # All agents of this process write into the same buffer and share the network.
    global_buffer = Buffer()
    agents = []
    for i in range(THREAD_NUM):
        agent = mini_source_agent.MiniSourceAgent(index=i, global_buffer=global_buffer, net=Net,
                                                  restore_model=FLAGS.restore_model, rl_training=FLAGS.training,
                                                  strategy_agent=None, ob_space_add=FLAGS.ob_space_add)
        agents.append(agent)

    print("Worker %d: waiting for cluster connection..." % index)
    sess.run(tf.report_uninitialized_variables())
    print("Worker %d: cluster ready!" % index)

    # Poll until no variable is reported as uninitialized any more.
    while len(sess.run(tf.report_uninitialized_variables())):
        print("Worker %d: waiting for variable initialization..." % index)
        time.sleep(1)
    print("Worker %d: variables initialized" % index)

    # Episodes per thread and update round, rounded UP.  True division is
    # required here: flooring first would make the ceil a no-op and would
    # yield 0 whenever update_game_num < THREAD_NUM, in which case the rollout
    # threads exit immediately and never reach the synchronization barriers.
    game_num = int(np.ceil(update_game_num / THREAD_NUM))

    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()

    # Run threads: all agents but the last one get their own daemon thread.
    threads = []
    for i in range(THREAD_NUM - 1):
        t = threading.Thread(target=run_thread, args=(agents[i], game_num, Synchronizer, FLAGS.difficulty))
        threads.append(t)
        t.daemon = True
        t.start()
        # Stagger the start of the individual threads.
        time.sleep(3)

    # The last agent runs in the calling thread.
    run_thread(agents[-1], game_num, Synchronizer, FLAGS.difficulty)

    for t in threads:
        t.join()


def Parameter_Server(Synchronizer, cluster, log_path, model_path, procs):
    """Run the parameter-server side of training in the calling process.

    Starts the TF server for task 0 of the "ps" job, builds a ``MiniNetwork``
    and a ``MiniSourceAgent`` on it, initializes the network and then performs
    ``TRAIN_ITERS`` update rounds.  Each round passes four
    ``Synchronizer.wait()`` barriers; between the second and third one the
    summary is updated, the model is saved when the win rate is at least as
    good as the best one seen so far, and the latest policy is saved.

    Args:
        Synchronizer: barrier-like object shared with the worker processes.
        cluster: cluster specification used for the TF server and the network.
        log_path: directory handed to ``tf.summary.FileWriter``.
        model_path: passed to ``MiniNetwork`` as ``ppo_save_path``.
        procs: worker process handles; not used inside this function.

    Returns:
        Tuple ``(max_win_rate, latest_win_rate)``.
    """
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_config.gpu_options.allow_growth = True

    ps_task = 0
    ps_server = tf.train.Server(cluster, job_name="ps", task_index=ps_task, config=session_config)
    session = tf.Session(target=ps_server.target, config=session_config)
    writer = tf.summary.FileWriter(log_path)

    network_options = dict(
        sess=session,
        summary_writer=writer,
        rl_training=FLAGS.training,
        cluster=cluster,
        index=ps_task,
        device=DEVICE[ps_task % len(DEVICE)],
        ppo_load_path=FLAGS.restore_model_path,
        ppo_save_path=model_path,
        ob_space_add=FLAGS.ob_space_add,
        act_space_add=FLAGS.act_space_add,
        freeze_head=FLAGS.freeze_head,
        use_bn=FLAGS.use_bn,
        use_sep_net=FLAGS.use_sep_net,
        restore_model=FLAGS.restore_model,
        restore_from=FLAGS.restore_from,
        restore_to=FLAGS.restore_to,
        load_latest=FLAGS.load_latest,
        add_image=FLAGS.add_image,
        partial_restore=FLAGS.partial_restore,
        weighted_sum_type=FLAGS.weighted_sum_type,
        initial_type=FLAGS.initial_type,
    )
    network = MiniNetwork(**network_options)

    ps_agent = mini_source_agent.MiniSourceAgent(
        index=-1,
        net=network,
        restore_model=FLAGS.restore_model,
        rl_training=FLAGS.training,
        ob_space_add=FLAGS.ob_space_add,
    )

    print("Parameter server: waiting for cluster connection...")
    session.run(tf.report_uninitialized_variables())
    print("Parameter server: cluster ready!")

    print("Parameter server: initializing variables...")
    ps_agent.init_network()
    print("Parameter server: variables initialized")

    best_win_rate = 0.
    last_win_rate = 0.

    for round_idx in range(TRAIN_ITERS):
        ps_agent.reset_old_network()

        # Barrier 1: wait for the workers to finish their rollouts.
        Synchronizer.wait()
        logging("PS: Wait for Update Result!")
        time.sleep(1)

        # Barrier 2: results are in, so summarize and save.
        Synchronizer.wait()
        logging("PS: Update Summary!")
        steps, win_rate = ps_agent.update_summary(round_idx)
        logging("PS: Steps: %d, win rate: %f" % (steps, win_rate))

        if win_rate >= best_win_rate:
            ps_agent.save_model()
            logging("PS: Save best model!")
            best_win_rate = win_rate

        last_win_rate = win_rate
        ps_agent.net.save_latest_policy()

        # Barrier 3: let the workers run their network update.
        Synchronizer.wait()
        logging("PS: Wait for Update Network!")
        # TODO count the time , compare cpu and gpu
        time.sleep(1)

        # Barrier 4: the update round is complete.
        Synchronizer.wait()
        logging("PS: Update Network finished!")

    return best_win_rate, last_win_rate


def _main(unused_argv):
    """Set up the local TF cluster, launch the workers and run the parameter server.

    The parameter server listens on ``PORT_NUM`` and the ``PARALLEL`` workers
    on the ports directly after it.  Each worker runs in its own daemon
    process; the parameter server runs in the calling process.  The best and
    the latest win rate are printed once the parameter server returns.

    Args:
        unused_argv: positional arguments handed over by ``app.run``; ignored.
    """
    # create distribute tf cluster
    start_port = PORT_NUM
    SERVER_DICT["ps"].append("localhost:%d" % start_port)
    for i in range(PARALLEL):
        SERVER_DICT["worker"].append("localhost:%d" % (start_port + 1 + i))

    Cluster = tf.train.ClusterSpec(SERVER_DICT)

    # Model and log directories of one run share the same timestamped name.
    now = datetime.now()
    run_name = now.strftime("%Y%m%d-%H%M%S") + "_source/"
    model_path = "./model/" + run_name
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # Place the summaries under the directory given by --log_path (LOG) rather
    # than a hard-coded "./logs/", so the flag is actually honoured.  For the
    # default flag value the resulting path is unchanged.
    log_path = os.path.join(LOG, run_name)

    # Episodes each worker process has to collect per update, rounded up.
    UPDATE_GAME_NUM = NUM_FOR_UPDATE
    per_update_num = np.ceil(UPDATE_GAME_NUM / PARALLEL)

    # One barrier party per worker process plus one for the parameter server.
    Synchronizer = mp.Barrier(PARALLEL + 1)
    # Run parallel process

    procs = []
    for index in range(PARALLEL):
        p = mp.Process(name="Worker_%d" % index, target=Worker, args=(index, per_update_num, Synchronizer, Cluster, model_path, log_path))
        procs.append(p)
        p.daemon = True
        p.start()
        # Stagger the start of the worker processes.
        time.sleep(1)

    max_win_rate, latest_win_rate = Parameter_Server(Synchronizer, Cluster, log_path, model_path, procs)
    print('#######################')
    print('Best Win_rate:', max_win_rate)
    print('Latest Win_rate:', latest_win_rate)
    print('#######################')

    for p in procs:
        p.join()

    # Profiling output is currently disabled:
    # if FLAGS.profile:
    #     print(stopwatch.sw)


# Script entry point: hand control to absl's app runner, which invokes _main.
if __name__ == "__main__":
    app.run(_main)