From 61d6dc57e21c1f42b397eb975dfb2bf12d59f100 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Wed, 24 Feb 2021 21:20:19 -0500 Subject: [PATCH 01/11] Make train_zerglings work with Tensorflow 1.14 --- common/common.py | 8 ++++---- common/vec_env/subproc_vec_env.py | 1 + defeat_zerglings/dqfd.py | 14 +++++++------- requirements.txt | 3 ++- train_defeat_zerglings.py | 7 +++++-- train_mineral_shards.py | 2 +- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/common/common.py b/common/common.py index 82934ee..c5c0669 100644 --- a/common/common.py +++ b/common/common.py @@ -442,19 +442,19 @@ def shift(direction, number, matrix): in the specified (UP, DOWN, LEFT, RIGHT) direction and return it ''' if direction in (UP): - matrix = np.roll(matrix, -number, axis=0) + matrix = np.roll(matrix.__array__(), -number, axis=0) matrix[number:, :] = -2 return matrix elif direction in (DOWN): - matrix = np.roll(matrix, number, axis=0) + matrix = np.roll(matrix.__array__(), number, axis=0) matrix[:number, :] = -2 return matrix elif direction in (LEFT): - matrix = np.roll(matrix, -number, axis=1) + matrix = np.roll(matrix.__array__(), -number, axis=1) matrix[:, number:] = -2 return matrix elif direction in (RIGHT): - matrix = np.roll(matrix, number, axis=1) + matrix = np.roll(matrix.__array__(), number, axis=1) matrix[:, :number] = -2 return matrix else: diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 2952ba0..6cddac0 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -21,6 +21,7 @@ def worker(remote, map_name, nscripts, i): ) with sc2_env.SC2Env( + players=[sc2_env.Agent(sc2_env.Race.terran)], agent_interface_format=[agent_format], map_name=map_name, step_mul=2) as env: diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index aa5707a..ae86e8e 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -6,7 +6,7 @@ import zipfile from absl import flags - +from baselines_legacy import cnn_to_mlp, BatchInput import baselines.common.tf_util as U from baselines import logger @@ -209,7 +209,7 @@ def learn(env, sess.__enter__() def make_obs_ph(name): - return U.BatchInput((64, 64), name=name) + return BatchInput((1, 32, 32), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, @@ -253,7 +253,7 @@ def make_obs_ph(name): obs = env.reset() # Select all marines first - player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = player_relative @@ -296,7 +296,7 @@ def make_obs_ph(name): obs, screen, player = common.select_marine(env, obs) action = act( - np.array(screen)[None], update_eps=update_eps, **kwargs)[0] + np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] reset = False rew = 0 @@ -315,14 +315,14 @@ def make_obs_ph(name): #print(e) 1 # Do nothing - player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] new_screen = player_relative rew += obs[0].reward done = obs[0].step_type == environment.StepType.LAST - selected = obs[0].observation["screen"][_SELECTED] + selected = obs[0].observation["feature_screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): @@ -351,7 +351,7 @@ def make_obs_ph(name): if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() - player_relative = 
obs[0].observation["screen"][ + player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screen = player_relative diff --git a/requirements.txt b/requirements.txt index fcbee81..029d0f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ git+https://github.com/openai/baselines numpy tensorflow absl-py -cloudpickle \ No newline at end of file +cloudpickle +dill \ No newline at end of file diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index 0fba7e3..bc3d8f7 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -7,6 +7,7 @@ from pysc2.env import sc2_env from pysc2.lib import actions from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat +from baselines_legacy import cnn_to_mlp from defeat_zerglings import dqfd @@ -75,9 +76,11 @@ def main(): map_name="DefeatZerglingsAndBanelings", step_mul=step_mul, visualize=True, - game_steps_per_episode=steps * step_mul) as env: + game_steps_per_episode=steps * step_mul, + agent_interface_format=sc2_env.AgentInterfaceFormat( + feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))) as env: - model = deepq.models.cnn_to_mlp( + model = cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True diff --git a/train_mineral_shards.py b/train_mineral_shards.py index 6cd775d..3661718 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -6,7 +6,7 @@ from pysc2.env import sc2_env from pysc2.lib import actions -from baselines_legacy import cnn_to_mlp +from baselines_legacy import cnn_to_mlp, BatchInput from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat from common.vec_env.subproc_vec_env import SubprocVecEnv From 9a704d8ac14de9efa4396c6dcaf39cd116c3846d Mon Sep 17 00:00:00 2001 From: rwill128 Date: Wed, 24 Feb 2021 22:30:32 -0500 Subject: [PATCH 02/11] Reformatting some files, and also fixing action assignments so that they fit the 32*32 action space. --- a2c/a2c.py | 12 +- common/common.py | 1110 ++++++++++++++--------------- common/vec_env/subproc_vec_env.py | 12 +- defeat_zerglings/dqfd.py | 658 +++++++++-------- train_defeat_zerglings.py | 3 +- 5 files changed, 890 insertions(+), 905 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index b11d814..2739292 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -22,7 +22,7 @@ _CONTROL_GROUP_RECALL = 0 _NOT_QUEUED = 0 -# np.set_printoptions(threshold=np.inf) +np.set_printoptions(threshold=np.inf) def mse(pred, target): return tf.square(pred-target)/2. 
@@ -300,12 +300,12 @@ def __init__(self, self.group_id = [0 for _ in range(nenv)] def update_obs(self, obs): # (self.nenv, 32, 32, 2) - #obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) self.obs = np.roll(self.obs, shift=-3, axis=3) new_map = np.zeros((self.nenv, 32, 32, 3)) new_map[:, :, :, -1] = obs[:, 0, :, :] for env_num in range(self.nenv): - # print("xy_per_marine: ", self.xy_per_marine) + print("xy_per_marine: ", self.xy_per_marine) if "0" not in self.xy_per_marine[env_num]: self.xy_per_marine[env_num]["0"] = [0, 0] if "1" not in self.xy_per_marine[env_num]: @@ -319,14 +319,14 @@ def update_obs(self, obs): # (self.nenv, 32, 32, 2) # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) def update_available(self, _available_actions): - #print("update_available : ", _available_actions) + print("update_available : ", _available_actions) self.available_actions = _available_actions # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) for env_num, list in enumerate(_available_actions): - # print("env_num :", env_num, " list :", list) + print("env_num :", env_num, " list :", list) for action_num in list: - # print("action_num :", action_num) + print("action_num :", action_num) if (action_num == 4): self.base_act_mask[env_num][0] = 1 self.base_act_mask[env_num][1] = 1 diff --git a/common/common.py b/common/common.py index c5c0669..73dbd9f 100644 --- a/common/common.py +++ b/common/common.py @@ -32,653 +32,647 @@ def init(env, obs): - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - # print("init") - army_count = env._obs[0].observation.player_common.army_count - - player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - - # if(army_count==0): - # return obs - # try: - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # - # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [[_SELECT_ALL]])]) - # except Exception as e: - # print(e) - # for i in range(len(player_x)): - # if i % 4 != 0: - # continue - # - # xy = [player_x[i], player_y[i]] - # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) - - group_id = 0 - group_list = [] - unit_xy_list = [] - last_xy = [0, 0] - xy_per_marine = {} - for i in range(len(player_x)): - - if group_id > 9: - break - - xy = [player_x[i], player_y[i]] - unit_xy_list.append(xy) - - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - obs = 
env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) - ]) - else: - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) - ]) - last_xy = xy - - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, - [[_CONTROL_GROUP_SET], [group_id]]) - ]) - unit_xy_list = [] - xy_per_marine[str(group_id)] = last_xy - - group_list.append(group_id) - group_id += 1 - - if len(unit_xy_list) >= 1: - for idx, xy in enumerate(unit_xy_list): - if idx == 0: - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) - ]) - else: + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + # print("init") + army_count = env._obs[0].observation.player_common.army_count + + player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + + # if(army_count==0): + # return obs + # try: + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # + # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [[_SELECT_ALL]])]) + # except Exception as e: + # print(e) + # for i in range(len(player_x)): + # if i % 4 != 0: + # continue + # + # xy = [player_x[i], player_y[i]] + # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) + + group_id = 0 + group_list = [] + unit_xy_list = [] + last_xy = [0, 0] + xy_per_marine = {} + for i in range(len(player_x)): + + if group_id > 9: + break + + xy = [player_x[i], player_y[i]] + unit_xy_list.append(xy) + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) + ]) + else: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + ]) + last_xy = xy + + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, + [[_CONTROL_GROUP_SET], [group_id]]) + ]) + unit_xy_list = [] + xy_per_marine[str(group_id)] = last_xy + + group_list.append(group_id) + group_id += 1 + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) + ]) + else: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + ]) + last_xy = xy + obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, + [[_CONTROL_GROUP_SET], [group_id]]) ]) - last_xy = xy + xy_per_marine[str(group_id)] = last_xy - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, - [[_CONTROL_GROUP_SET], [group_id]]) 
- ]) - xy_per_marine[str(group_id)] = last_xy + group_list.append(group_id) + group_id += 1 - group_list.append(group_id) - group_id += 1 - - return obs, xy_per_marine + return obs, xy_per_marine def solve_tsp( - player_relative, - selected, - group_list, - group_id, - dest_per_marine, - xy_per_marine): - - my_dest = None - other_dest = None - closest, min_dist = None, None - actions = [] - neutral_y, neutral_x = (player_relative == 1).nonzero() - player_y, player_x = (selected == 1).nonzero() - - #for group_id in group_list: - if "0" in dest_per_marine and "1" in dest_per_marine: - if group_id == 0: - my_dest = dest_per_marine["0"] - other_dest = dest_per_marine["1"] - else: - my_dest = dest_per_marine["1"] - other_dest = dest_per_marine["0"] + player_relative, + selected, + group_list, + group_id, + dest_per_marine, + xy_per_marine): + my_dest = None + other_dest = None + closest, min_dist = None, None + actions = [] + neutral_y, neutral_x = (player_relative == 1).nonzero() + player_y, player_x = (selected == 1).nonzero() + + # for group_id in group_list: + if "0" in dest_per_marine and "1" in dest_per_marine: + if group_id == 0: + my_dest = dest_per_marine["0"] + other_dest = dest_per_marine["1"] + else: + my_dest = dest_per_marine["1"] + other_dest = dest_per_marine["0"] - if len(player_x) > 0: - if group_id == 0: - xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())] - else: - xy_per_marine["0"] = [int(player_x.mean()), int(player_y.mean())] - - player = xy_per_marine[str(group_id)] - points = [player] - - for p in zip(neutral_x, neutral_y): - - if other_dest: - dist = np.linalg.norm(np.array(other_dest) - np.array(p)) - if dist < 10: - # print("continue since partner will take care of it ", p) - continue - - pp = [p[0], p[1]] - if pp not in points: - points.append(pp) - - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist or dist < min_dist: - closest, min_dist = p, dist - - solve_tsp = False - if my_dest: - dist = np.linalg.norm(np.array(player) - np.array(my_dest)) - if dist < 0.5: - solve_tsp = True - - if my_dest is None: - solve_tsp = True - - if len(points) < 2: - solve_tsp = False - - if solve_tsp: - # function for printing best found solution when it is found - from time import clock - init = clock() - - def report_sol(obj, s=""): - print("cpu:%g\tobj:%g\ttour:%s" % \ - (clock(), obj, s)) - - n, D = mk_matrix(points, distL2) - niter = 50 - tour, z = multistart_localsearch(niter, n, D) - - left, right = None, None - for idx in tour: - if tour[idx] == 0: - if idx == len(tour) - 1: - right = points[tour[0]] - left = points[tour[idx - 1]] - elif idx == 0: - right = points[tour[idx + 1]] - left = points[tour[len(tour) - 1]] - else: - right = points[tour[idx + 1]] - left = points[tour[idx - 1]] - - left_d = np.linalg.norm(np.array(player) - np.array(left)) - right_d = np.linalg.norm(np.array(player) - np.array(right)) - if right_d > left_d: - closest = left - else: - closest = right - - #print("optimal next :" , closest) - dest_per_marine[str(group_id)] = closest - #print("dest_per_marine", self.dest_per_marine) - #dest_per_marine {'0': [56, 26], '1': [52, 6]} - - if closest: - if group_id == 0: - actions.append({ - "base_action": group_id, - "x0": closest[0], - "y0": closest[1] - }) - else: - actions.append({ - "base_action": group_id, - "x1": closest[0], - "y1": closest[1] - }) + if len(player_x) > 0: + if group_id == 0: + xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())] + else: + xy_per_marine["0"] = [int(player_x.mean()), 
int(player_y.mean())] + + player = xy_per_marine[str(group_id)] + points = [player] + + for p in zip(neutral_x, neutral_y): + + if other_dest: + dist = np.linalg.norm(np.array(other_dest) - np.array(p)) + if dist < 10: + # print("continue since partner will take care of it ", p) + continue + + pp = [p[0], p[1]] + if pp not in points: + points.append(pp) + + dist = np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist or dist < min_dist: + closest, min_dist = p, dist + + solve_tsp = False + if my_dest: + dist = np.linalg.norm(np.array(player) - np.array(my_dest)) + if dist < 0.5: + solve_tsp = True + + if my_dest is None: + solve_tsp = True + + if len(points) < 2: + solve_tsp = False + + if solve_tsp: + # function for printing best found solution when it is found + from time import clock + init = clock() + + def report_sol(obj, s=""): + print("cpu:%g\tobj:%g\ttour:%s" % \ + (clock(), obj, s)) + + n, D = mk_matrix(points, distL2) + niter = 50 + tour, z = multistart_localsearch(niter, n, D) + + left, right = None, None + for idx in tour: + if tour[idx] == 0: + if idx == len(tour) - 1: + right = points[tour[0]] + left = points[tour[idx - 1]] + elif idx == 0: + right = points[tour[idx + 1]] + left = points[tour[len(tour) - 1]] + else: + right = points[tour[idx + 1]] + left = points[tour[idx - 1]] + + left_d = np.linalg.norm(np.array(player) - np.array(left)) + right_d = np.linalg.norm(np.array(player) - np.array(right)) + if right_d > left_d: + closest = left + else: + closest = right + + # print("optimal next :" , closest) + dest_per_marine[str(group_id)] = closest + # print("dest_per_marine", self.dest_per_marine) + # dest_per_marine {'0': [56, 26], '1': [52, 6]} + + if closest: + if group_id == 0: + actions.append({ + "base_action": group_id, + "x0": closest[0], + "y0": closest[1] + }) + else: + actions.append({ + "base_action": group_id, + "x1": closest[0], + "y1": closest[1] + }) + + elif my_dest: + if group_id == 0: + actions.append({ + "base_action": group_id, + "x0": my_dest[0], + "y0": my_dest[1] + }) + else: + actions.append({ + "base_action": group_id, + "x1": my_dest[0], + "y1": my_dest[1] + }) + else: + if group_id == 0: + actions.append({ + "base_action": 2, + "x0": 0, + "y0": 0 + }) + else: + actions.append({ + "base_action": 2, + "x1": 0, + "y1": 0 + }) + + # elif(len(group_list)>0): + # + # group_id = random.randint(0,len(group_list)-1) + # actions.append({"base_action":group_id}) - elif my_dest: - if group_id == 0: - actions.append({ - "base_action": group_id, - "x0": my_dest[0], - "y0": my_dest[1] - }) - else: - actions.append({ - "base_action": group_id, - "x1": my_dest[0], - "y1": my_dest[1] - }) + if group_id == 0: + group_id = 1 else: - if group_id == 0: - actions.append({ - "base_action": 2, - "x0": 0, - "y0": 0 - }) - else: - actions.append({ - "base_action": 2, - "x1": 0, - "y1": 0 - }) - - # elif(len(group_list)>0): - # - # group_id = random.randint(0,len(group_list)-1) - # actions.append({"base_action":group_id}) + group_id = 0 - if group_id == 0: - group_id = 1 - else: - group_id = 0 + if "0" not in xy_per_marine: + xy_per_marine["0"] = [0, 0] + if "1" not in xy_per_marine: + xy_per_marine["1"] = [0, 0] - if "0" not in xy_per_marine: - xy_per_marine["0"] = [0, 0] - if "1" not in xy_per_marine: - xy_per_marine["1"] = [0, 0] - - return actions, group_id, dest_per_marine, xy_per_marine + return actions, group_id, dest_per_marine, xy_per_marine def group_init_queue(player_relative): + actions = [] + + player_x, player_y = (player_relative == 
_PLAYER_FRIENDLY).nonzero() + # try: + # + # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + # actions.append({"base_action":_SELECT_ARMY, "sub7":_SELECT_ALL}) + # + # except Exception as e: + # print(e) + # for i in range(len(player_x)): + # if i % 4 != 0: + # continue + # + # xy = [player_x[i], player_y[i]] + # actions.append({"base_action":_SELECT_POINT, "sub6":0, "x0":xy[0], "y0":xy[1]}) + + group_id = 0 + group_list = [] + unit_xy_list = [] + for i in range(len(player_x)): + + if group_id > 9: + break + + xy = [player_x[i], player_y[i]] + unit_xy_list.append(xy) + # 2/select_point (6/select_point_act [4]; 0/screen [84, 84]) + # 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10]) + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 0, + "x0": xy[0], + "y0": xy[1] + }) + else: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 1, + "x0": xy[0], + "y0": xy[1] + }) + + actions.append({ + "base_action": _SELECT_CONTROL_GROUP, + "sub4": _CONTROL_GROUP_SET, + "sub5": group_id + }) + unit_xy_list = [] + + group_list.append(group_id) + group_id += 1 + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 0, + "x0": xy[0], + "y0": xy[1] + }) + else: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 1, + "x0": xy[0], + "y0": xy[1] + }) - actions = [] - - player_x, player_y = (player_relative == _PLAYER_FRIENDLY).nonzero() - # try: - # - # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - # actions.append({"base_action":_SELECT_ARMY, "sub7":_SELECT_ALL}) - # - # except Exception as e: - # print(e) - # for i in range(len(player_x)): - # if i % 4 != 0: - # continue - # - # xy = [player_x[i], player_y[i]] - # actions.append({"base_action":_SELECT_POINT, "sub6":0, "x0":xy[0], "y0":xy[1]}) - - group_id = 0 - group_list = [] - unit_xy_list = [] - for i in range(len(player_x)): - - if group_id > 9: - break - - xy = [player_x[i], player_y[i]] - unit_xy_list.append(xy) - # 2/select_point (6/select_point_act [4]; 0/screen [84, 84]) - # 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10]) - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 0, - "x0": xy[0], - "y0": xy[1] - }) - else: - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 1, - "x0": xy[0], - "y0": xy[1] - }) - - actions.append({ - "base_action": _SELECT_CONTROL_GROUP, - "sub4": _CONTROL_GROUP_SET, - "sub5": group_id - }) - unit_xy_list = [] - - group_list.append(group_id) - group_id += 1 - - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 0, - "x0": xy[0], - "y0": xy[1] - }) - else: actions.append({ - "base_action": _SELECT_POINT, - "sub6": 1, - "x0": xy[0], - "y0": xy[1] + "base_action": _SELECT_CONTROL_GROUP, + "sub4": _CONTROL_GROUP_SET, + "sub5": group_id }) - actions.append({ - "base_action": _SELECT_CONTROL_GROUP, - "sub4": _CONTROL_GROUP_SET, - "sub5": group_id - }) - - group_list.append(group_id) - group_id += 1 + group_list.append(group_id) + group_id += 1 - return actions + return actions def update_group_list2(control_group): + group_count = 0 + group_list = [] - group_count = 0 - group_list = [] + for control_group_id, data in 
enumerate(control_group): - for control_group_id, data in enumerate(control_group): + unit_id = data[0] + count = data[1] - unit_id = data[0] - count = data[1] + if unit_id != 0: + group_count += 1 + group_list.append(control_group_id) - if (unit_id != 0): - group_count += 1 - group_list.append(control_group_id) - - return group_list + return group_list def check_group_list2(extra): - army_count = 0 - # (64, 64, 3) - for control_group_id in range(10): - unit_id = extra[control_group_id, 1] - count = extra[control_group_id, 2] - if (unit_id != 0): - army_count += count + army_count = 0 + # (64, 64, 3) + for control_group_id in range(10): + unit_id = extra[control_group_id, 1] + count = extra[control_group_id, 2] + if unit_id != 0: + army_count += count - if (army_count != extra[0, 0]): - return True + if army_count != extra[0, 0]: + return True - return False + return False def update_group_list(obs): - control_groups = obs[0].observation["control_groups"] - group_count = 0 - group_list = [] - for id, group in enumerate(control_groups): - if (group[0] != 0): - group_count += 1 - group_list.append(id) - return group_list + control_groups = obs[0].observation["control_groups"] + group_count = 0 + group_list = [] + for id, group in enumerate(control_groups): + if group[0] != 0: + group_count += 1 + group_list.append(id) + return group_list def check_group_list(env, obs): - error = False - control_groups = obs[0].observation["control_groups"] - army_count = 0 - for id, group in enumerate(control_groups): - if (group[0] == 48): - army_count += group[1] - if (group[1] != 1): - #print("group error group_id : %s count : %s" % (id, group[1])) + error = False + control_groups = obs[0].observation["control_groups"] + army_count = 0 + for id, group in enumerate(control_groups): + if group[0] == 48: + army_count += group[1] + if group[1] != 1: + print("group error group_id : %s count : %s" % (id, group[1])) + error = True + return error + if army_count != env._obs[0].observation.player_common.army_count: error = True - return error - if (army_count != env._obs[0].observation.player_common.army_count): - error = True - # print("army_count %s != %s env._obs.observation.player_common.army_count " - # % (army_count, env._obs.observation.player_common.army_count)) + print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) - return error + return error UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right' def shift(direction, number, matrix): - ''' shift given 2D matrix in-place the given number of rows or columns - in the specified (UP, DOWN, LEFT, RIGHT) direction and return it -''' - if direction in (UP): - matrix = np.roll(matrix.__array__(), -number, axis=0) - matrix[number:, :] = -2 - return matrix - elif direction in (DOWN): - matrix = np.roll(matrix.__array__(), number, axis=0) - matrix[:number, :] = -2 - return matrix - elif direction in (LEFT): - matrix = np.roll(matrix.__array__(), -number, axis=1) - matrix[:, number:] = -2 - return matrix - elif direction in (RIGHT): - matrix = np.roll(matrix.__array__(), number, axis=1) - matrix[:, :number] = -2 - return matrix - else: - return matrix + ''' shift given 2D matrix in-place the given number of rows or columns + in the specified (UP, DOWN, LEFT, RIGHT) direction and return it + ''' + if direction in UP: + matrix = np.roll(matrix.__array__(), -number, axis=0) + matrix[number:, :] = -2 + return matrix + elif direction in DOWN: + matrix = np.roll(matrix.__array__(), 
number, axis=0) + matrix[:number, :] = -2 + return matrix + elif direction in LEFT: + matrix = np.roll(matrix.__array__(), -number, axis=1) + matrix[:, number:] = -2 + return matrix + elif direction in RIGHT: + matrix = np.roll(matrix.__array__(), number, axis=1) + matrix[:, :number] = -2 + return matrix + else: + return matrix def select_marine(env, obs): + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + screen = player_relative - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - screen = player_relative + group_list = update_group_list(obs) - group_list = update_group_list(obs) + if check_group_list(env, obs): + obs, xy_per_marine = init(env, obs) + group_list = update_group_list(obs) - if (check_group_list(env, obs)): - obs, xy_per_marine = init(env, obs) - group_list = update_group_list(obs) + # if(len(group_list) == 0): + # obs = init(env, player_relative, obs) + # group_list = update_group_list(obs) - # if(len(group_list) == 0): - # obs = init(env, player_relative, obs) - # group_list = update_group_list(obs) - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - - friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - - enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() - - player = [] - - danger_closest, danger_min_dist = None, None - for e in zip(enemy_x, enemy_y): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(p) - np.array(e)) - if not danger_min_dist or dist < danger_min_dist: - danger_closest, danger_min_dist = p, dist - - marine_closest, marine_min_dist = None, None - for e in zip(friendly_x, friendly_y): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(p) - np.array(e)) - if not marine_min_dist or dist < marine_min_dist: - if dist >= 2: - marine_closest, marine_min_dist = p, dist - - if (danger_min_dist != None and danger_min_dist <= 5): - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], danger_closest]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - elif (marine_closest != None and marine_min_dist <= 3): - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], marine_closest]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - else: - - # If there is no marine in danger, select random - while (len(group_list) > 0): - # units = env._obs.observation.raw_data.units - # marine_list = [] # for unit in units: - # if(unit.alliance == 1): - # marine_list.append(unit) - - group_id = np.random.choice(group_list) - #xy = [int(unit.pos.y - 10), int(unit.pos.x+8)] - #print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]])) - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[ - _CONTROL_GROUP_RECALL - ], [int(group_id)]]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - break - else: - group_list.remove(group_id) - - if (len(player) == 2): - - if (player[0] > 32): - screen = shift(LEFT, player[0] - 32, screen) - elif (player[0] < 32): - screen = shift(RIGHT, 
32 - player[0], screen) - - if (player[1] > 32): - screen = shift(UP, player[1] - 32, screen) - elif (player[1] < 32): - screen = shift(DOWN, 32 - player[1], screen) - - return obs, screen, player + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() -def marine_action(env, obs, player, action): + enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() + + player = [] + + danger_closest, danger_min_dist = None, None + for e in zip(enemy_x, enemy_y): + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(p) - np.array(e)) + if not danger_min_dist or dist < danger_min_dist: + danger_closest, danger_min_dist = p, dist - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + marine_closest, marine_min_dist = None, None + for e in zip(friendly_x, friendly_y): + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(p) - np.array(e)) + if not marine_min_dist or dist < marine_min_dist: + if dist >= 2: + marine_closest, marine_min_dist = p, dist + + if danger_min_dist is not None and danger_min_dist <= 5: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], danger_closest]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + + elif marine_closest is not None and marine_min_dist <= 3: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], marine_closest]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + + else: + + # If there is no marine in danger, select random + while len(group_list) > 0: + # units = env._obs.observation.raw_data.units + # marine_list = [] # for unit in units: + # if(unit.alliance == 1): + # marine_list.append(unit) + + group_id = np.random.choice(group_list) + # xy = [int(unit.pos.y - 10), int(unit.pos.x+8)] + # print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]])) + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[ + _CONTROL_GROUP_RECALL + ], [int(group_id)]]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + break + else: + group_list.remove(group_id) + + if len(player) == 2: + + if player[0] > 32: + screen = shift(LEFT, player[0] - 32, screen) + elif player[0] < 32: + screen = shift(RIGHT, 32 - player[0], screen) + + if player[1] > 32: + screen = shift(UP, player[1] - 32, screen) + elif player[1] < 32: + screen = shift(DOWN, 32 - player[1], screen) + + return obs, screen, player + + +def marine_action(env, obs, player, action): + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() + enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() - closest, min_dist = None, None + closest, min_dist = None, None - if (len(player) == 2): - for p in zip(enemy_x, enemy_y): - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist or dist < min_dist: - closest, min_dist = p, dist + if len(player) == 2: + for p in zip(enemy_x, enemy_y): + dist = 
np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist or dist < min_dist: + closest, min_dist = p, dist - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - closest_friend, min_dist_friend = None, None - if (len(player) == 2): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist_friend or dist < min_dist_friend: - closest_friend, min_dist_friend = p, dist + closest_friend, min_dist_friend = None, None + if len(player) == 2: + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist_friend or dist < min_dist_friend: + closest_friend, min_dist_friend = p, dist - if (closest == None): + if closest is None: - new_action = [sc2_actions.FunctionCall(_NO_OP, [])] + new_action = [sc2_actions.FunctionCall(_NO_OP, [])] - elif (action == 0 and closest_friend != None and min_dist_friend < 3): - # Friendly marine is too close => Sparse! + elif action == 0 and closest_friend is not None and min_dist_friend < 3: + # Friendly marine is too close => Sparse! - mean_friend = [int(friendly_x.mean()), int(friendly_x.mean())] + mean_friend = [int(friendly_x.mean()), int(friendly_x.mean())] - diff = np.array(player) - np.array(closest_friend) + diff = np.array(player) - np.array(closest_friend) - norm = np.linalg.norm(diff) + norm = np.linalg.norm(diff) - if (norm != 0): - diff = diff / norm + if norm != 0: + diff = diff / norm - coord = np.array(player) + diff * 4 + coord = np.array(player) + diff * 4 - if (coord[0] < 0): - coord[0] = 0 - elif (coord[0] > 63): - coord[0] = 63 + if coord[0] < 0: + coord[0] = 0 + elif coord[0] > 31: + coord[0] = 31 - if (coord[1] < 0): - coord[1] = 0 - elif (coord[1] > 63): - coord[1] = 63 + if coord[1] < 0: + coord[1] = 0 + elif coord[1] > 31: + coord[1] = 31 - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action <= 1): #Attack + elif action <= 1: # Attack - # nearest enemy + # nearest enemy - coord = closest + coord = closest - new_action = [ - sc2_actions.FunctionCall(_ATTACK_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + sc2_actions.FunctionCall(_ATTACK_SCREEN, [[_NOT_QUEUED], coord]) + ] - #print("action : %s Attack Coord : %s" % (action, coord)) + # print("action : %s Attack Coord : %s" % (action, coord)) - elif (action == 2): # Oppsite direcion from enemy + elif action == 2: # Oppsite direcion from enemy - # nearest enemy opposite + # nearest enemy opposite - diff = np.array(player) - np.array(closest) + diff = np.array(player) - np.array(closest) - norm = np.linalg.norm(diff) + norm = np.linalg.norm(diff) - if (norm != 0): - diff = diff / norm + if norm != 0: + diff = diff / norm - coord = np.array(player) + diff * 7 + coord = np.array(player) + diff * 7 - if (coord[0] < 0): - coord[0] = 0 - elif (coord[0] > 63): - coord[0] = 63 + if coord[0] < 0: + coord[0] = 0 + elif coord[0] > 31: + coord[0] = 31 - if (coord[1] < 0): - coord[1] = 0 - elif (coord[1] > 63): - coord[1] = 63 + if coord[1] < 0: + coord[1] = 0 + elif coord[1] > 31: + coord[1] = 31 - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + 
sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 4): #UP - coord = [player[0], player[1] - 3] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 4: # UP + coord = [player[0], player[1] - 3] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 5): #DOWN - coord = [player[0], player[1] + 3] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 5: # DOWN + coord = [player[0], player[1] + 3] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 6): #LEFT - coord = [player[0] - 3, player[1]] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 6: # LEFT + coord = [player[0] - 3, player[1]] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 7): #RIGHT - coord = [player[0] + 3, player[1]] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 7: # RIGHT + coord = [player[0] + 3, player[1]] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - #print("action : %s Back Coord : %s" % (action, coord)) + print("action : %s Back Coord : %s" % (action, coord)) - return obs, new_action + return obs, new_action diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 6cddac0..16fcb99 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -41,10 +41,10 @@ def worker(remote, map_name, nscripts, i): action1 = data[0][0] action2 = data[0][1] - # func = actions.FUNCTIONS[action1[0]] - # print("agent(",i," ) action : ", action1, " func : ", func) + func = actions.FUNCTIONS[action1[0]] + print("agent(",i," ) action : ", action1, " func : ", func) func = actions.FUNCTIONS[action2[0]] - # print("agent(",i," ) action : ", action2, " func : ", func) + print("agent(",i," ) action : ", action2, " func : ", func) result = env.step(actions=[action1]) @@ -55,10 +55,10 @@ def worker(remote, map_name, nscripts, i): if len(action2[1]) == 2: x, y = action2[1][1] - # print("x, y:", x, y) + print("x, y:", x, y) - # if x == 0 and y == 0: - # move = False + if x == 0 and y == 0: + move = False if (331 in available_actions and move and not done): try: diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index ae86e8e..0f1aee7 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -50,65 +50,65 @@ class ActWrapper(object): - def __init__(self, act): - self._act = act - #self._act_params = act_params - - @staticmethod - def load(path, act_params, num_cpu=16): - with open(path, "rb") as f: - model_data = dill.load(f) - act = deepq.build_act(**act_params) - sess = U.make_session(num_cpu=num_cpu) - sess.__enter__() - with tempfile.TemporaryDirectory() as td: - arc_path = os.path.join(td, "packed.zip") - with open(arc_path, "wb") as f: - f.write(model_data) - - zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) - U.load_state(os.path.join(td, "model")) - - return ActWrapper(act) - - def __call__(self, *args, **kwargs): - return self._act(*args, **kwargs) - - def save(self, path): - """Save model to a pickle located at `path`""" - with tempfile.TemporaryDirectory() as td: - U.save_state(os.path.join(td, "model")) - arc_name = os.path.join(td, "packed.zip") - with zipfile.ZipFile(arc_name, 'w') as zipf: - for 
root, dirs, files in os.walk(td): - for fname in files: - file_path = os.path.join(root, fname) - if file_path != arc_name: - zipf.write(file_path, - os.path.relpath(file_path, td)) - with open(arc_name, "rb") as f: - model_data = f.read() - with open(path, "wb") as f: - dill.dump((model_data), f) + def __init__(self, act): + self._act = act + # self._act_params = act_params + + @staticmethod + def load(path, act_params, num_cpu=16): + with open(path, "rb") as f: + model_data = dill.load(f) + act = deepq.build_act(**act_params) + sess = U.make_session(num_cpu=num_cpu) + sess.__enter__() + with tempfile.TemporaryDirectory() as td: + arc_path = os.path.join(td, "packed.zip") + with open(arc_path, "wb") as f: + f.write(model_data) + + zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) + U.load_state(os.path.join(td, "model")) + + return ActWrapper(act) + + def __call__(self, *args, **kwargs): + return self._act(*args, **kwargs) + + def save(self, path): + """Save model to a pickle located at `path`""" + with tempfile.TemporaryDirectory() as td: + U.save_state(os.path.join(td, "model")) + arc_name = os.path.join(td, "packed.zip") + with zipfile.ZipFile(arc_name, 'w') as zipf: + for root, dirs, files in os.walk(td): + for fname in files: + file_path = os.path.join(root, fname) + if file_path != arc_name: + zipf.write(file_path, + os.path.relpath(file_path, td)) + with open(arc_name, "rb") as f: + model_data = f.read() + with open(path, "wb") as f: + dill.dump((model_data), f) def load(path, act_params, num_cpu=16): - """Load act function that was returned by learn function. + """Load act function that was returned by learn function. -Parameters ----------- -path: str - path to the act function pickle -num_cpu: int - number of cpus to use for executing the policy + Parameters + ---------- + path: str + path to the act function pickle + num_cpu: int + number of cpus to use for executing the policy -Returns -------- -act: ActWrapper - function that takes a batch of observations - and returns actions. -""" - return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params) + Returns + ------- + act: ActWrapper + function that takes a batch of observations + and returns actions. + """ + return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params) def learn(env, @@ -136,283 +136,273 @@ def learn(env, param_noise_threshold=0.05, callback=None, demo_replay=[]): - """Train a deepq model. - -Parameters -------- -env: pysc2.env.SC2Env - environment to train on -q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. -lr: float - learning rate for adam optimizer -max_timesteps: int - number of env steps to optimizer for -buffer_size: int - size of the replay buffer -exploration_fraction: float - fraction of entire training period over which the exploration rate is annealed -exploration_final_eps: float - final value of random action probability -train_freq: int - update the model every `train_freq` steps. - set to None to disable printing -batch_size: int - size of a batched sampled from replay buffer for training -print_freq: int - how often to print out training progress - set to None to disable printing -checkpoint_freq: int - how often to save the model. 
This is so that the best version is restored - at the end of the training. If you do not wish to restore the best version at - the end of the training set this variable to None. -learning_starts: int - how many steps of the model to collect transitions for before learning starts -gamma: float - discount factor -target_network_update_freq: int - update the target network every `target_network_update_freq` steps. -prioritized_replay: True - if True prioritized replay buffer will be used. -prioritized_replay_alpha: float - alpha parameter for prioritized replay buffer -prioritized_replay_beta0: float - initial value of beta for prioritized replay buffer -prioritized_replay_beta_iters: int - number of iterations over which beta will be annealed from initial value - to 1.0. If set to None equals to max_timesteps. -prioritized_replay_eps: float - epsilon to add to the TD errors when updating priorities. -num_cpu: int - number of cpus to use for training -callback: (locals, globals) -> None - function called at every steps with state of the algorithm. - If callback returns true training stops. - -Returns -------- -act: ActWrapper - Wrapper over act function. Adds ability to save it and load it. - See header of baselines/deepq/categorical.py for details on the act function. -""" - # Create all the functions necessary to train the model - - sess = U.make_session(num_cpu=num_cpu) - sess.__enter__() - - def make_obs_ph(name): - return BatchInput((1, 32, 32), name=name) - - act, train, update_target, debug = deepq.build_train( - make_obs_ph=make_obs_ph, - q_func=q_func, - num_actions=num_actions, - optimizer=tf.train.AdamOptimizer(learning_rate=lr), - gamma=gamma, - grad_norm_clipping=10) - act_params = { - 'make_obs_ph': make_obs_ph, - 'q_func': q_func, - 'num_actions': num_actions, - } - - # Create the replay buffer - if prioritized_replay: - replay_buffer = PrioritizedReplayBuffer( - buffer_size, alpha=prioritized_replay_alpha) - if prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = max_timesteps - beta_schedule = LinearSchedule( - prioritized_replay_beta_iters, - initial_p=prioritized_replay_beta0, - final_p=1.0) - else: - replay_buffer = ReplayBuffer(buffer_size) - beta_schedule = None - # Create the schedule for exploration starting from 1. - exploration = LinearSchedule( - schedule_timesteps=int(exploration_fraction * max_timesteps), - initial_p=1.0, - final_p=exploration_final_eps) - - # Initialize the parameters and copy them to the target network. - U.initialize() - update_target() - - episode_rewards = [0.0] - saved_mean_reward = None - - obs = env.reset() - # Select all marines first - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - - screen = player_relative - - obs, xy_per_marine = common.init(env, obs) - - group_id = 0 - reset = True - with tempfile.TemporaryDirectory() as td: - model_saved = False - model_file = os.path.join(td, "model") - - for t in range(max_timesteps): - if callback is not None: - if callback(locals(), globals()): - break - # Take action and update exploration to the newest value - kwargs = {} - if not param_noise: - update_eps = exploration.value(t) - update_param_noise_threshold = 0. - else: - update_eps = 0. - if param_noise_threshold >= 0.: - update_param_noise_threshold = param_noise_threshold - else: - # Compute the threshold such that the KL divergence between perturbed and non-perturbed - # policy is comparable to eps-greedy exploration with eps = exploration.value(t). 
- # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 - # for detailed explanation. - update_param_noise_threshold = -np.log( - 1. - exploration.value(t) + - exploration.value(t) / float(num_actions)) - kwargs['reset'] = reset - kwargs[ - 'update_param_noise_threshold'] = update_param_noise_threshold - kwargs['update_param_noise_scale'] = True - - # custom process for DefeatZerglingsAndBanelings - - obs, screen, player = common.select_marine(env, obs) - - action = act( - np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] - reset = False - rew = 0 - - new_action = None - - obs, new_action = common.marine_action(env, obs, player, action) - army_count = env._obs[0].observation.player_common.army_count - - try: - if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]: - obs = env.step(actions=new_action) - else: - new_action = [sc2_actions.FunctionCall(_NO_OP, [])] - obs = env.step(actions=new_action) - except Exception as e: - #print(e) - 1 # Do nothing - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - new_screen = player_relative - - rew += obs[0].reward - - done = obs[0].step_type == environment.StepType.LAST - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - if (len(player) == 2): - - if (player[0] > 32): - new_screen = common.shift(LEFT, player[0] - 32, new_screen) - elif (player[0] < 32): - new_screen = common.shift(RIGHT, 32 - player[0], - new_screen) - - if (player[1] > 32): - new_screen = common.shift(UP, player[1] - 32, new_screen) - elif (player[1] < 32): - new_screen = common.shift(DOWN, 32 - player[1], new_screen) - - # Store transition in the replay buffer. - replay_buffer.add(screen, action, rew, new_screen, float(done)) - screen = new_screen - - episode_rewards[-1] += rew - reward = episode_rewards[-1] - - if done: - print("Episode Reward : %s" % episode_rewards[-1]) - obs = env.reset() - player_relative = obs[0].observation["feature_screen"][ - _PLAYER_RELATIVE] - - screen = player_relative - - group_list = common.init(env, obs) - - # Select all marines first - #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) - episode_rewards.append(0.0) - - reset = True - - if t > learning_starts and t % train_freq == 0: - # Minimize the error in Bellman's equation on a batch sampled from replay buffer. - if prioritized_replay: - experience = replay_buffer.sample( - batch_size, beta=beta_schedule.value(t)) - (obses_t, actions, rewards, obses_tp1, dones, weights, - batch_idxes) = experience - else: - obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( - batch_size) - weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, - weights) - if prioritized_replay: - new_priorities = np.abs(td_errors) + prioritized_replay_eps - replay_buffer.update_priorities(batch_idxes, - new_priorities) - - if t > learning_starts and t % target_network_update_freq == 0: - # Update target network periodically. 
- update_target() - - mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) - num_episodes = len(episode_rewards) - if done and print_freq is not None and len( - episode_rewards) % print_freq == 0: - logger.record_tabular("steps", t) - logger.record_tabular("episodes", num_episodes) - logger.record_tabular("reward", reward) - logger.record_tabular("mean 100 episode reward", - mean_100ep_reward) - logger.record_tabular("% time spent exploring", - int(100 * exploration.value(t))) - logger.dump_tabular() - - if (checkpoint_freq is not None and t > learning_starts - and num_episodes > 100 and t % checkpoint_freq == 0): - if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: - if print_freq is not None: - logger.log( - "Saving model due to mean reward increase: {} -> {}". - format(saved_mean_reward, mean_100ep_reward)) - U.save_state(model_file) - model_saved = True - saved_mean_reward = mean_100ep_reward - if model_saved: - if print_freq is not None: - logger.log("Restored model with mean reward: {}".format( - saved_mean_reward)) - U.load_state(model_file) - - return ActWrapper(act) + """Train a deepq model. + + Parameters + ------- + env: pysc2.env.SC2Env + environment to train on + q_func: (tf.Variable, int, str, bool) -> tf.Variable + the model that takes the following inputs: + observation_in: object + the output of observation placeholder + num_actions: int + number of actions + scope: str + reuse: bool + should be passed to outer variable scope + and returns a tensor of shape (batch_size, num_actions) with values of every action. + lr: float + learning rate for adam optimizer + max_timesteps: int + number of env steps to optimizer for + buffer_size: int + size of the replay buffer + exploration_fraction: float + fraction of entire training period over which the exploration rate is annealed + exploration_final_eps: float + final value of random action probability + train_freq: int + update the model every `train_freq` steps. + set to None to disable printing + batch_size: int + size of a batched sampled from replay buffer for training + print_freq: int + how often to print out training progress + set to None to disable printing + checkpoint_freq: int + how often to save the model. This is so that the best version is restored + at the end of the training. If you do not wish to restore the best version at + the end of the training set this variable to None. + learning_starts: int + how many steps of the model to collect transitions for before learning starts + gamma: float + discount factor + target_network_update_freq: int + update the target network every `target_network_update_freq` steps. + prioritized_replay: True + if True prioritized replay buffer will be used. + prioritized_replay_alpha: float + alpha parameter for prioritized replay buffer + prioritized_replay_beta0: float + initial value of beta for prioritized replay buffer + prioritized_replay_beta_iters: int + number of iterations over which beta will be annealed from initial value + to 1.0. If set to None equals to max_timesteps. + prioritized_replay_eps: float + epsilon to add to the TD errors when updating priorities. + num_cpu: int + number of cpus to use for training + callback: (locals, globals) -> None + function called at every steps with state of the algorithm. + If callback returns true training stops. + + Returns + ------- + act: ActWrapper + Wrapper over act function. Adds ability to save it and load it. + See header of baselines/deepq/categorical.py for details on the act function. 
+ """ + # Create all the functions necessary to train the model + + sess = U.make_session(num_cpu=num_cpu) + sess.__enter__() + + def make_obs_ph(name): + return BatchInput((1, 32, 32), name=name) + + act, train, update_target, debug = deepq.build_train( + make_obs_ph=make_obs_ph, + q_func=q_func, + num_actions=num_actions, + optimizer=tf.train.AdamOptimizer(learning_rate=lr), + gamma=gamma, + grad_norm_clipping=10) + act_params = { + 'make_obs_ph': make_obs_ph, + 'q_func': q_func, + 'num_actions': num_actions, + } + + # Create the replay buffer + if prioritized_replay: + replay_buffer = PrioritizedReplayBuffer( + buffer_size, alpha=prioritized_replay_alpha) + if prioritized_replay_beta_iters is None: + prioritized_replay_beta_iters = max_timesteps + beta_schedule = LinearSchedule( + prioritized_replay_beta_iters, + initial_p=prioritized_replay_beta0, + final_p=1.0) + else: + replay_buffer = ReplayBuffer(buffer_size) + beta_schedule = None + # Create the schedule for exploration starting from 1. + exploration = LinearSchedule( + schedule_timesteps=int(exploration_fraction * max_timesteps), + initial_p=1.0, + final_p=exploration_final_eps) + + # Initialize the parameters and copy them to the target network. + U.initialize() + update_target() + + episode_rewards = [0.0] + saved_mean_reward = None + + obs = env.reset() + # Select all marines first + + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + + screen = player_relative + + obs, xy_per_marine = common.init(env, obs) + + group_id = 0 + reset = True + with tempfile.TemporaryDirectory() as td: + model_saved = False + model_file = os.path.join(td, "model") + + for t in range(max_timesteps): + if callback is not None: + if callback(locals(), globals()): + break + # Take action and update exploration to the newest value + kwargs = {} + if not param_noise: + update_eps = exploration.value(t) + update_param_noise_threshold = 0. + else: + update_eps = 0. + if param_noise_threshold >= 0.: + update_param_noise_threshold = param_noise_threshold + else: + # Compute the threshold such that the KL divergence between perturbed and non-perturbed + # policy is comparable to eps-greedy exploration with eps = exploration.value(t). + # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 + # for detailed explanation. + update_param_noise_threshold = -np.log( + 1. 
- exploration.value(t) + + exploration.value(t) / float(num_actions)) + kwargs['reset'] = reset + kwargs[ + 'update_param_noise_threshold'] = update_param_noise_threshold + kwargs['update_param_noise_scale'] = True + + # custom process for DefeatZerglingsAndBanelings + + obs, screen, player = common.select_marine(env, obs) + + action = act( + np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] + reset = False + rew = 0 + + new_action = None + + obs, new_action = common.marine_action(env, obs, player, action) + army_count = env._obs[0].observation.player_common.army_count + + try: + if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]: + obs = env.step(actions=new_action) + else: + new_action = [sc2_actions.FunctionCall(_NO_OP, [])] + obs = env.step(actions=new_action) + except Exception as e: + print(e) + rew += -10 + # 1 # Do nothing + + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + new_screen = player_relative + + rew += obs[0].reward + + done = obs[0].step_type == environment.StepType.LAST + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + + if (len(player_y) > 0): + player = [int(player_x.mean()), int(player_y.mean())] + + if (len(player) == 2): + + if (player[0] > 32): + new_screen = common.shift(LEFT, player[0] - 32, new_screen) + elif (player[0] < 32): + new_screen = common.shift(RIGHT, 32 - player[0], + new_screen) + + if (player[1] > 32): + new_screen = common.shift(UP, player[1] - 32, new_screen) + elif (player[1] < 32): + new_screen = common.shift(DOWN, 32 - player[1], new_screen) + + # Store transition in the replay buffer. + replay_buffer.add(screen, action, rew, new_screen, float(done)) + screen = new_screen + + episode_rewards[-1] += rew + reward = episode_rewards[-1] + + if done: + print("Episode Reward : %s" % episode_rewards[-1]) + obs = env.reset() + player_relative = obs[0].observation["feature_screen"][ + _PLAYER_RELATIVE] + + screen = player_relative + + group_list = common.init(env, obs) + + # Select all marines first + # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) + episode_rewards.append(0.0) + + reset = True + + if t > learning_starts and t % train_freq == 0: + # Minimize the error in Bellman's equation on a batch sampled from replay buffer. + if prioritized_replay: + experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) + (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience + else: + obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) + weights, batch_idxes = np.ones_like(rewards), None + td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + if prioritized_replay: + new_priorities = np.abs(td_errors) + prioritized_replay_eps + replay_buffer.update_priorities(batch_idxes, new_priorities) + + if t > learning_starts and t % target_network_update_freq == 0: + # Update target network periodically. 
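# Copying the online Q-network weights into the target network only every
# target_network_update_freq steps keeps the bootstrap targets of the
# Bellman update fixed between copies, which helps keep this DQN-style
# training stable.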
+ update_target() + + mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) + num_episodes = len(episode_rewards) + if done and print_freq is not None and len(episode_rewards) % print_freq == 0: + logger.record_tabular("steps", t) + logger.record_tabular("episodes", num_episodes) + logger.record_tabular("reward", reward) + logger.record_tabular("mean 100 episode reward", mean_100ep_reward) + logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) + logger.dump_tabular() + + if checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0: + if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: + if print_freq is not None: + logger.log("Saving model due to mean reward increase: {} -> {}".format(saved_mean_reward, + mean_100ep_reward)) + U.save_state(model_file) + model_saved = True + saved_mean_reward = mean_100ep_reward + if model_saved: + if print_freq is not None: + logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) + U.load_state(model_file) + + return ActWrapper(act) diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index bc3d8f7..5eb3d79 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -1,3 +1,4 @@ +import pprint import sys import os import datetime @@ -140,7 +141,7 @@ def deepq_callback(locals, globals): def acktr_callback(locals, globals): global max_mean_reward, last_filename - #pprint.pprint(locals) + pprint.pprint(locals) if('mean_100ep_reward' in locals and locals['num_episodes'] >= 10 From 264b7435db92d7f53059bd4f446b808de1474280 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 14:03:44 -0500 Subject: [PATCH 03/11] Reformatting whitespace in a2c.py --- a2c/a2c.py | 1378 ++++++++++++++--------------- common/common.py | 4 +- common/vec_env/subproc_vec_env.py | 6 +- deepq_mineral_shards.py | 3 +- defeat_zerglings/dqfd.py | 13 +- train_defeat_zerglings.py | 8 +- train_mineral_shards.py | 2 +- 7 files changed, 705 insertions(+), 709 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index 2739292..5e2dbfa 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -24,606 +24,604 @@ np.set_printoptions(threshold=np.inf) + def mse(pred, target): - return tf.square(pred-target)/2. + return tf.square(pred - target) / 2. 
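The factor of 1/2 in mse above is the usual convention that makes the gradient with respect to pred come out as the plain TD error (pred - target). A quick standalone sanity check of that property, assuming the same TF1 graph/session style used throughout this file (illustrative only, not part of the training graph):

    import tensorflow as tf

    pred = tf.constant(3.0)
    target = tf.constant(1.0)
    loss = tf.square(pred - target) / 2.   # same form as mse() above
    grad = tf.gradients(loss, pred)[0]     # d(loss)/d(pred) = pred - target

    with tf.Session() as sess:
        print(sess.run([loss, grad]))      # expected output: [2.0, 2.0]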
+ class Model(object): - def __init__(self, - policy, - ob_space, - ac_space, - nenvs, - total_timesteps, - nprocs=32, - nscripts=16, - nsteps=20, - nstack=4, - ent_coef=0.1, - vf_coef=0.5, - vf_fisher_coef=1.0, - lr=0.25, - max_grad_norm=0.001, - kfac_clip=0.001, - lrschedule='linear', - alpha=0.99, - epsilon=1e-5): - config = tf.ConfigProto( - allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) - nsml.bind(sess=sess) - #nact = ac_space.n - nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - - XY0 = tf.placeholder(tf.int32, [nbatch]) - XY1 = tf.placeholder(tf.int32, [nbatch]) - - # ADV == TD_TARGET - values - ADV = tf.placeholder(tf.float32, [nbatch]) - TD_TARGET = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) - - self.model = step_model = policy( - sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy( - sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - - # Policy 1 : Base Action : train_model.pi label = A - - script_mask = tf.concat( - [ - tf.zeros([nscripts * nsteps, 1]), - tf.ones([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) - - pi = train_model.pi - pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi, labels=A) - neglogpac *= tf.stop_gradient(pac_weight) - - inv_A = 1.0 - tf.cast(A, tf.float32) - - xy0_mask = tf.cast(A, tf.float32) - xy1_mask = tf.cast(A, tf.float32) - - condition0 = tf.equal(xy0_mask, 2) - xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) - xy0_mask = 1.0 - xy0_mask - - condition1 = tf.equal(xy1_mask, 2) - xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - - # One hot representation of chosen marine. - # [batch_size, 2] - pi_xy0 = train_model.pi_xy0 - pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) - - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy0, labels=XY0) - logpac_xy0 *= tf.stop_gradient(pac_weight) - logpac_xy0 *= tf.cast(xy0_mask, tf.float32) - - pi_xy1 = train_model.pi_xy1 - pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) - - # 1D? 2D? 
- logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy1, labels=XY1) - logpac_xy1 *= tf.stop_gradient(pac_weight) - logpac_xy1 *= tf.cast(xy1_mask, tf.float32) - - pg_loss = tf.reduce_mean(ADV * neglogpac) - pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) - - vf_ = tf.squeeze(train_model.vf) - - vf_r = tf.concat( - [ - tf.ones([nscripts * nsteps, 1]), - tf.zeros([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) * TD_TARGET - vf_masked = vf_ * script_mask + vf_r - - #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] - - vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) - entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) - entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) - entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) - entropy = entropy_a + entropy_xy0 + entropy_xy1 - - loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - - params = find_trainable_variables("model") - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train = trainer.apply_gradients(grads) - - self.logits = logits = train_model.pi - - # xy0 - - self.params_common = params_common = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy0') + params_common - - train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy0 = grads_xy0 = tf.gradients( - train_loss_xy0, params_xy0) - if max_grad_norm is not None: - grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) - - grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - - # xy1 - - self.params_xy1 = params_xy1 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy1') + params_common - - train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy1 = grads_xy1 = tf.gradients( - train_loss_xy1, params_xy1) - if max_grad_norm is not None: - grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) - - grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) - - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) - - def train(obs, states, td_targets, masks, actions, xy0, xy1, values): - advs = td_targets - values - for step in range(len(obs)): - cur_lr = self.lr.value() - - td_map = { - train_model.X: obs, - A: actions, - XY0: xy0, - XY1: xy1, - ADV: advs, - TD_TARGET: td_targets, - PG_LR: cur_lr - } - if states != []: - td_map[train_model.S] = states - td_map[train_model.M] = masks - - policy_loss, value_loss, policy_entropy, _, \ - policy_loss_xy0, policy_entropy_xy0, _, \ - policy_loss_xy1, policy_entropy_xy1, _ = sess.run( - [pg_loss, vf_loss, entropy, _train, - pg_loss_xy0, entropy_xy0, _train_xy0, - pg_loss_xy1, entropy_xy1, _train_xy1], - td_map) - return policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1 - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, 
save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - print("global_variables_initializer start") - tf.global_variables_initializer().run(session=sess) - print("global_variables_initializer complete") + def __init__(self, + policy, + ob_space, + ac_space, + nenvs, + total_timesteps, + nprocs=32, + nscripts=16, + nsteps=20, + nstack=4, + ent_coef=0.1, + vf_coef=0.5, + vf_fisher_coef=1.0, + lr=0.25, + max_grad_norm=0.001, + kfac_clip=0.001, + lrschedule='linear', + alpha=0.99, + epsilon=1e-5): + config = tf.ConfigProto( + allow_soft_placement=True, + intra_op_parallelism_threads=nprocs, + inter_op_parallelism_threads=nprocs) + config.gpu_options.allow_growth = True + self.sess = sess = tf.Session(config=config) + nsml.bind(sess=sess) + # nact = ac_space.n + nbatch = nenvs * nsteps + A = tf.placeholder(tf.int32, [nbatch]) + + XY0 = tf.placeholder(tf.int32, [nbatch]) + XY1 = tf.placeholder(tf.int32, [nbatch]) + + # ADV == TD_TARGET - values + ADV = tf.placeholder(tf.float32, [nbatch]) + TD_TARGET = tf.placeholder(tf.float32, [nbatch]) + PG_LR = tf.placeholder(tf.float32, []) + VF_LR = tf.placeholder(tf.float32, []) + + self.model = step_model = policy( + sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy( + sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + + # Policy 1 : Base Action : train_model.pi label = A + + script_mask = tf.concat( + [ + tf.zeros([nscripts * nsteps, 1]), + tf.ones([(nprocs - nscripts) * nsteps, 1]) + ], + axis=0) + + pi = train_model.pi + pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi, labels=A) + neglogpac *= tf.stop_gradient(pac_weight) + + inv_A = 1.0 - tf.cast(A, tf.float32) + + xy0_mask = tf.cast(A, tf.float32) + xy1_mask = tf.cast(A, tf.float32) + + condition0 = tf.equal(xy0_mask, 2) + xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) + xy0_mask = 1.0 - xy0_mask + + condition1 = tf.equal(xy1_mask, 2) + xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) + + # One hot representation of chosen marine. + # [batch_size, 2] + pi_xy0 = train_model.pi_xy0 + pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 + pac_weight = tf.reduce_sum( + pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi_xy0, labels=XY0) + logpac_xy0 *= tf.stop_gradient(pac_weight) + logpac_xy0 *= tf.cast(xy0_mask, tf.float32) + + pi_xy1 = train_model.pi_xy1 + pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 + pac_weight = tf.reduce_sum( + pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + # 1D? 2D? 
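# The coordinate heads are 1-D here: pi_xy0 and pi_xy1 each emit
# 1024 = 32 * 32 logits over the flattened screen, and the XY0 / XY1 labels
# are flat indices (y * 32 + x, as built in Runner.run below), which is why
# tf.one_hot(..., depth=1024) and sparse softmax cross-entropy with integer
# labels are used rather than 2-D coordinates.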
+ logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi_xy1, labels=XY1) + logpac_xy1 *= tf.stop_gradient(pac_weight) + logpac_xy1 *= tf.cast(xy1_mask, tf.float32) + + pg_loss = tf.reduce_mean(ADV * neglogpac) + pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + + vf_ = tf.squeeze(train_model.vf) + + vf_r = tf.concat( + [ + tf.ones([nscripts * nsteps, 1]), + tf.zeros([(nprocs - nscripts) * nsteps, 1]) + ], + axis=0) * TD_TARGET + vf_masked = vf_ * script_mask + vf_r + + # vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] + + vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) + entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) + entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) + entropy = entropy_a + entropy_xy0 + entropy_xy1 + + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + + params = find_trainable_variables("model") + grads = tf.gradients(loss, params) + if max_grad_norm is not None: + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) + grads = list(zip(grads, params)) + trainer = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train = trainer.apply_gradients(grads) + + self.logits = logits = train_model.pi + + # xy0 + + self.params_common = params_common = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, + scope='model/xy0') + params_common + + train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy0 = grads_xy0 = tf.gradients( + train_loss_xy0, params_xy0) + if max_grad_norm is not None: + grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) + + grads_xy0 = list(zip(grads_xy0, params_xy0)) + trainer_xy0 = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) + + # xy1 + + self.params_xy1 = params_xy1 = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, + scope='model/xy1') + params_common + + train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy1 = grads_xy1 = tf.gradients( + train_loss_xy1, params_xy1) + if max_grad_norm is not None: + grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) + + grads_xy1 = list(zip(grads_xy1, params_xy1)) + trainer_xy1 = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) + + self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + + def train(obs, states, td_targets, masks, actions, xy0, xy1, values): + advs = td_targets - values + for step in range(len(obs)): + cur_lr = self.lr.value() + + td_map = { + train_model.X: obs, + A: actions, + XY0: xy0, + XY1: xy1, + ADV: advs, + TD_TARGET: td_targets, + PG_LR: cur_lr + } + if states != []: + td_map[train_model.S] = states + td_map[train_model.M] = masks + + policy_loss, value_loss, policy_entropy, _, \ + policy_loss_xy0, policy_entropy_xy0, _, \ + policy_loss_xy1, policy_entropy_xy1, _ = sess.run( + [pg_loss, vf_loss, entropy, _train, + pg_loss_xy0, entropy_xy0, _train_xy0, + pg_loss_xy1, entropy_xy1, _train_xy1], + td_map) + return policy_loss, value_loss, policy_entropy, \ + policy_loss_xy0, policy_entropy_xy0, \ + policy_loss_xy1, policy_entropy_xy1 + + def save(save_path): + ps = sess.run(params) + joblib.dump(ps, 
save_path) + + def load(load_path): + loaded_params = joblib.load(load_path) + restores = [] + for p, loaded_p in zip(params, loaded_params): + restores.append(p.assign(loaded_p)) + sess.run(restores) + + self.train = train + self.save = save + self.load = load + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.value = step_model.value + self.initial_state = step_model.initial_state + print("global_variables_initializer start") + tf.global_variables_initializer().run(session=sess) + print("global_variables_initializer complete") class Runner(object): - def __init__(self, - env, - model, - nsteps, - nscripts, - nstack, - gamma, - callback=None): - self.env = env - self.model = model - nh, nw, nc = (32, 32, 3) - self.nsteps = nsteps - self.nscripts = nscripts - self.nenv = nenv = env.num_envs - self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) - self.batch_coord_shape = (nenv * nsteps, 32) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - self.available_actions = None - self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset( - ) - self.xy_per_marine = [{"0":[0,0], "1":[0,0]} for _ in range(nenv)] - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - self.update_obs(obs) # (2,13,32,32) - self.update_available(available_actions) - self.gamma = gamma - self.states = model.initial_state - self.dones = [False for _ in range(nenv)] - self.total_reward = [0.0 for _ in range(nenv)] - self.episode_rewards = [] - self.episode_rewards_script = [] - self.episode_rewards_a2c = [] - self.episodes = 0 - self.steps = 0 - self.callback = callback - - self.action_queue = [[] for _ in range(nenv)] - self.group_list = [[] for _ in range(nenv)] - self.agent_state = ["IDLE" for _ in range(nenv)] - self.dest_per_marine = [{} for _ in range(nenv)] - - self.group_id = [0 for _ in range(nenv)] - - def update_obs(self, obs): # (self.nenv, 32, 32, 2) - obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) - self.obs = np.roll(self.obs, shift=-3, axis=3) - new_map = np.zeros((self.nenv, 32, 32, 3)) - new_map[:, :, :, -1] = obs[:, 0, :, :] - for env_num in range(self.nenv): - print("xy_per_marine: ", self.xy_per_marine) - if "0" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["0"] = [0, 0] - if "1" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["1"] = [0, 0] - - marine0 = self.xy_per_marine[env_num]["0"] - marine1 = self.xy_per_marine[env_num]["1"] - new_map[env_num, marine0[0], marine0[1], -3] = 1 - new_map[env_num, marine1[0], marine1[1], -2] = 1 - self.obs[:, :, :, -3:] = new_map - # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) - - def update_available(self, _available_actions): - print("update_available : ", _available_actions) - self.available_actions = _available_actions - # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) - self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) - for env_num, list in enumerate(_available_actions): - print("env_num :", env_num, " list :", list) - for action_num in list: - print("action_num :", action_num) - if (action_num == 4): - self.base_act_mask[env_num][0] = 1 - self.base_act_mask[env_num][1] = 1 - elif action_num == 0: - self.base_act_mask[env_num][2] = 1 - # 
elif(action_num == 331): - # self.base_act_mask[env_num][2] = 1 - - def valid_base_action(self, base_actions): - for env_num, list in enumerate(self.available_actions): - avail = [] - for action_num in list: - if (action_num == 4): - avail.append(0) - avail.append(1) - elif action_num == 0: - avail.append(2) - # elif(action_num == 331): - # avail.append(2) - - if base_actions[env_num] not in avail: - print("env_num", env_num, " argmax is not valid. random pick ", - avail) - base_actions[env_num] = np.random.choice(avail) - - return base_actions - - def trans_base_actions(self, base_actions): - new_base_actions = np.copy(base_actions) - for env_num, ba in enumerate(new_base_actions): - if (ba == 0): - new_base_actions[env_num] = 4 # move marine control group 0 - elif (ba == 1): - new_base_actions[env_num] = 4 # move marine control group 1 - elif (ba == 2): - new_base_actions[env_num] = 0 # move marine control group 1 - # elif(ba==2): - # new_base_actions[env_num] = 331 # move marine xy0 - - return new_base_actions - - - def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): - actions = [] - for env_num, spec in enumerate(base_action_spec): - # print("spec", spec.args) - args = [] - # for arg_idx, arg in enumerate(spec.args): - # #print("arg", arg) - # #print("arg.id", arg.id) - # if(arg.id==0): # screen (32,32) x0, y0 - # args.append([int(x0[env_num]), int(y0[env_num])]) - # # elif(arg.id==1): # minimap (32,32) x1, y1 - # # args.append([int(x1[env_num]), int(y1[env_num])]) - # # elif(arg.id==2): # screen2 (32,32) x2, y2 - # # args.append([int(x2[env_num]), y2[env_num]]) - # elif(arg.id==3): # pi3 queued (2) - # args.append([int(0)]) - # elif(arg.id==4): # pi4 control_group_act (5) - # args.append([_CONTROL_GROUP_RECALL]) - # elif(arg.id==5): # pi5 control_group_id 10 - # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 - # # elif(arg.id==6): # pi6 select_point_act 4 - # # args.append([int(sub6[env_num])]) - # # elif(arg.id==7): # pi7 select_add 2 - # # args.append([int(sub7[env_num])]) - # # elif(arg.id==8): # pi8 select_unit_act 4 - # # args.append([int(sub8[env_num])]) - # # elif(arg.id==9): # pi9 select_unit_id 500 - # # args.append([int(sub9[env_num])]) - # # elif(arg.id==10): # pi10 select_worker 4 - # # args.append([int(sub10[env_num])]) - # # elif(arg.id==11): # pi11 build_queue_id 10 - # # args.append([int(sub11[env_num])]) - # # elif(arg.id==12): # pi12 unload_id 500 - # # args.append([int(sub12[env_num])]) - # else: - # raise NotImplementedError("cannot construct this arg", spec.args) - two_action = [] - if base_actions[env_num] == 0: - two_action.append( - sc2_actions.FunctionCall( - 4, - [[_CONTROL_GROUP_RECALL], [0]] - )) - - two_action.append( - sc2_actions.FunctionCall( - 331, - [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) - - elif base_actions[env_num] == 1: - two_action.append( - sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append( - sc2_actions.FunctionCall( - 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) - elif base_actions[env_num] == 2: - two_action.append( - sc2_actions.FunctionCall(0, [])) - two_action.append( - sc2_actions.FunctionCall(0, [])) - - #action = sc2_actions.FunctionCall(a, args) - actions.append(two_action) - - return actions - - def run(self): - mb_obs, mb_td_targets, mb_base_actions, \ - mb_xy0, mb_xy1, \ - mb_values, mb_dones \ - = [], [], [], [], [], [], [] - - mb_states = self.states - for n in range(self.nsteps): - # pi, pi2, x1, y1, x2, y2, v0 - pi1, pi_xy0, pi_xy1, 
values, states = self.model.step( - self.obs, self.states, self.dones) - - pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - - base_actions = np.argmax( - pi1 * self.base_act_mask + pi1_noise, axis=1) - xy0 = np.argmax(pi_xy0, axis=1) - - x0 = (xy0 % 32).astype(int) - y0 = (xy0 / 32).astype(int) - - xy1 = np.argmax(pi_xy1, axis=1) - x1 = (xy1 % 32).astype(int) - y1 = (xy1 / 32).astype(int) - - # Scripted Agent Hacking - - for env_num in range(self.nenv): - if env_num >= self.nscripts: # only for scripted agents - continue - - ob = self.obs[env_num, :, :, :] - player_relative = ob[:, :, -1] - - self.group_list[env_num] = common.update_group_list2( - self.control_groups[env_num]) - - if len(self.action_queue[env_num]) == 0: - - self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \ - common.solve_tsp(player_relative, - self.selected[env_num][0], - self.group_list[env_num], - self.group_id[env_num], - self.dest_per_marine[env_num], - self.xy_per_marine[env_num]) - - base_actions[env_num] = 0 - x0[env_num] = 0 - y0[env_num] = 0 - x1[env_num] = 0 - y1[env_num] = 0 - - if len(self.action_queue[env_num]) > 0: - action = self.action_queue[env_num].pop(0) - base_actions[env_num] = action.get("base_action", 0) - - x0[env_num] = action.get("x0", 0) - y0[env_num] = action.get("y0", 0) - xy0[env_num] = y0[env_num] * 32 + x0[env_num] - - x1[env_num] = action.get("x1", 0) - y1[env_num] = action.get("y1", 0) - xy1[env_num] = y1[env_num] * 32 + x1[env_num] - - base_actions = self.valid_base_action(base_actions) - new_base_actions = self.trans_base_actions(base_actions) - - base_action_spec = self.env.action_spec(new_base_actions) - # print("base_actions:", base_actions) - actions = self.construct_action( - base_actions, - base_action_spec, - x0, - y0, - x1, - y1 - ) - - mb_obs.append(np.copy(self.obs)) - mb_base_actions.append(base_actions) - - mb_xy0.append(xy0) - mb_xy1.append(xy1) - mb_values.append(values) - mb_dones.append(self.dones) - - #print("final acitons : ", actions) - obs, rewards, dones,\ - available_actions, army_counts,\ - control_groups, selected, xy_per_marine\ - = self.env.step( - actions=actions) - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.update_available(available_actions) - - self.states = states - self.dones = dones - mean_100ep_reward_a2c = 0 - for n, done in enumerate(dones): - self.total_reward[n] += float(rewards[n]) - if done: - self.obs[n] = self.obs[n] * 0 - self.episodes += 1 - num_episodes = self.episodes - self.episode_rewards.append(self.total_reward[n]) - - model = self.model - mean_100ep_reward = round( - np.mean(self.episode_rewards[-101:]), 1) - if (n < self.nscripts): # scripted agents - self.episode_rewards_script.append( - self.total_reward[n]) - mean_100ep_reward_script = round( - np.mean(self.episode_rewards_script[-101:]), 1) - nsml.report( - reward_script=self.total_reward[n], - mean_reward_script=mean_100ep_reward_script, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - else: - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round( - np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report( - reward_a2c=self.total_reward[n], - mean_reward_a2c=mean_100ep_reward_a2c, - reward=self.total_reward[n], - 
mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() + def __init__(self, + env, + model, + nsteps, + nscripts, + nstack, + gamma, + callback=None): + self.env = env + self.model = model + nh, nw, nc = (32, 32, 3) + self.nsteps = nsteps + self.nscripts = nscripts + self.nenv = nenv = env.num_envs + self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) + self.batch_coord_shape = (nenv * nsteps, 32) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + self.available_actions = None + self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() + self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + self.update_obs(obs) # (2,13,32,32) + self.update_available(available_actions) + self.gamma = gamma + self.states = model.initial_state + self.dones = [False for _ in range(nenv)] + self.total_reward = [0.0 for _ in range(nenv)] + self.episode_rewards = [] + self.episode_rewards_script = [] + self.episode_rewards_a2c = [] + self.episodes = 0 + self.steps = 0 + self.callback = callback + + self.action_queue = [[] for _ in range(nenv)] + self.group_list = [[] for _ in range(nenv)] + self.agent_state = ["IDLE" for _ in range(nenv)] + self.dest_per_marine = [{} for _ in range(nenv)] + + self.group_id = [0 for _ in range(nenv)] + + def update_obs(self, obs): # (self.nenv, 32, 32, 2) + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + self.obs = np.roll(self.obs, shift=-3, axis=3) + new_map = np.zeros((self.nenv, 32, 32, 3)) + new_map[:, :, :, -1] = obs[:, 0, :, :] + for env_num in range(self.nenv): + # print("xy_per_marine: ", self.xy_per_marine) + if "0" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["0"] = [0, 0] + if "1" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["1"] = [0, 0] + + marine0 = self.xy_per_marine[env_num]["0"] + marine1 = self.xy_per_marine[env_num]["1"] + new_map[env_num, marine0[0], marine0[1], -3] = 1 + new_map[env_num, marine1[0], marine1[1], -2] = 1 + self.obs[:, :, :, -3:] = new_map + # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) + + def update_available(self, _available_actions): + # print("update_available : ", _available_actions) + self.available_actions = _available_actions + # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) + self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) + for env_num, list in enumerate(_available_actions): + # print("env_num :", env_num, " list :", list) + for action_num in list: + # print("action_num :", action_num) + if (action_num == 4): + self.base_act_mask[env_num][0] = 1 + self.base_act_mask[env_num][1] = 1 + elif action_num == 0: + self.base_act_mask[env_num][2] = 1 + # elif(action_num == 331): + # self.base_act_mask[env_num][2] = 1 + + def valid_base_action(self, base_actions): + for env_num, list in enumerate(self.available_actions): + avail = [] + for action_num in list: + if (action_num == 4): + avail.append(0) + avail.append(1) + elif action_num == 0: + avail.append(2) + # elif(action_num == 331): + # avail.append(2) + + if base_actions[env_num] not in avail: + # print("env_num", env_num, " argmax is not valid. 
random pick ", avail) + base_actions[env_num] = np.random.choice(avail) + + return base_actions + + def trans_base_actions(self, base_actions): + new_base_actions = np.copy(base_actions) + for env_num, ba in enumerate(new_base_actions): + if (ba == 0): + new_base_actions[env_num] = 4 # move marine control group 0 + elif (ba == 1): + new_base_actions[env_num] = 4 # move marine control group 1 + elif (ba == 2): + new_base_actions[env_num] = 0 # move marine control group 1 + # elif(ba==2): + # new_base_actions[env_num] = 331 # move marine xy0 + + return new_base_actions + + def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): + actions = [] + for env_num, spec in enumerate(base_action_spec): + # print("spec", spec.args) + args = [] + # for arg_idx, arg in enumerate(spec.args): + # #print("arg", arg) + # #print("arg.id", arg.id) + # if(arg.id==0): # screen (32,32) x0, y0 + # args.append([int(x0[env_num]), int(y0[env_num])]) + # # elif(arg.id==1): # minimap (32,32) x1, y1 + # # args.append([int(x1[env_num]), int(y1[env_num])]) + # # elif(arg.id==2): # screen2 (32,32) x2, y2 + # # args.append([int(x2[env_num]), y2[env_num]]) + # elif(arg.id==3): # pi3 queued (2) + # args.append([int(0)]) + # elif(arg.id==4): # pi4 control_group_act (5) + # args.append([_CONTROL_GROUP_RECALL]) + # elif(arg.id==5): # pi5 control_group_id 10 + # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 + # # elif(arg.id==6): # pi6 select_point_act 4 + # # args.append([int(sub6[env_num])]) + # # elif(arg.id==7): # pi7 select_add 2 + # # args.append([int(sub7[env_num])]) + # # elif(arg.id==8): # pi8 select_unit_act 4 + # # args.append([int(sub8[env_num])]) + # # elif(arg.id==9): # pi9 select_unit_id 500 + # # args.append([int(sub9[env_num])]) + # # elif(arg.id==10): # pi10 select_worker 4 + # # args.append([int(sub10[env_num])]) + # # elif(arg.id==11): # pi11 build_queue_id 10 + # # args.append([int(sub11[env_num])]) + # # elif(arg.id==12): # pi12 unload_id 500 + # # args.append([int(sub12[env_num])]) + # else: + # raise NotImplementedError("cannot construct this arg", spec.args) + two_action = [] + if base_actions[env_num] == 0: + two_action.append( + sc2_actions.FunctionCall( + 4, + [[_CONTROL_GROUP_RECALL], [0]] + )) + + two_action.append( + sc2_actions.FunctionCall( + 331, + [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + + elif base_actions[env_num] == 1: + two_action.append( + sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append( + sc2_actions.FunctionCall( + 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + elif base_actions[env_num] == 2: + two_action.append( + sc2_actions.FunctionCall(0, [])) + two_action.append( + sc2_actions.FunctionCall(0, [])) + + # action = sc2_actions.FunctionCall(a, args) + actions.append(two_action) + + return actions + + def run(self): + mb_obs, mb_td_targets, mb_base_actions, \ + mb_xy0, mb_xy1, \ + mb_values, mb_dones \ + = [], [], [], [], [], [], [] + + mb_states = self.states + for n in range(self.nsteps): + # pi, pi2, x1, y1, x2, y2, v0 + pi1, pi_xy0, pi_xy1, values, states = self.model.step( + self.obs, self.states, self.dones) + + pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 + + base_actions = np.argmax( + pi1 * self.base_act_mask + pi1_noise, axis=1) + xy0 = np.argmax(pi_xy0, axis=1) + + x0 = (xy0 % 32).astype(int) + y0 = (xy0 / 32).astype(int) + + xy1 = np.argmax(pi_xy1, axis=1) + x1 = (xy1 % 32).astype(int) + y1 = (xy1 / 32).astype(int) + + # Scripted Agent Hacking + + for env_num in 
range(self.nenv): + if env_num >= self.nscripts: # only for scripted agents + continue + + ob = self.obs[env_num, :, :, :] + player_relative = ob[:, :, -1] + + self.group_list[env_num] = common.update_group_list2( + self.control_groups[env_num]) + + if len(self.action_queue[env_num]) == 0: + self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], \ + self.xy_per_marine[env_num] = \ + common.solve_tsp(player_relative, + self.selected[env_num][0], + self.group_list[env_num], + self.group_id[env_num], + self.dest_per_marine[env_num], + self.xy_per_marine[env_num]) + + base_actions[env_num] = 0 + x0[env_num] = 0 + y0[env_num] = 0 + x1[env_num] = 0 + y1[env_num] = 0 + + if len(self.action_queue[env_num]) > 0: + action = self.action_queue[env_num].pop(0) + base_actions[env_num] = action.get("base_action", 0) + + x0[env_num] = action.get("x0", 0) + y0[env_num] = action.get("y0", 0) + xy0[env_num] = y0[env_num] * 32 + x0[env_num] + + x1[env_num] = action.get("x1", 0) + y1[env_num] = action.get("y1", 0) + xy1[env_num] = y1[env_num] * 32 + x1[env_num] + + base_actions = self.valid_base_action(base_actions) + new_base_actions = self.trans_base_actions(base_actions) + + base_action_spec = self.env.action_spec(new_base_actions) + # print("base_actions:", base_actions) + actions = self.construct_action( + base_actions, + base_action_spec, + x0, + y0, + x1, + y1 ) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) - if self.callback is not None: - self.callback(locals(), globals()) - self.total_reward[n] = 0 - self.group_list[n] = [] - - - self.update_obs(obs) - mb_td_targets.append(rewards) - mb_dones.append(self.dones) - #batch of steps to batch of rollouts - mb_obs = np.asarray( - mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( + mb_obs.append(np.copy(self.obs)) + mb_base_actions.append(base_actions) + + mb_xy0.append(xy0) + mb_xy1.append(xy1) + mb_values.append(values) + mb_dones.append(self.dones) + + # print("final acitons : ", actions) + obs, rewards, dones, \ + available_actions, army_counts, \ + control_groups, selected, xy_per_marine \ + = self.env.step( + actions=actions) + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.update_available(available_actions) + + self.states = states + self.dones = dones + mean_100ep_reward_a2c = 0 + for n, done in enumerate(dones): + self.total_reward[n] += float(rewards[n]) + if done: + self.obs[n] = self.obs[n] * 0 + self.episodes += 1 + num_episodes = self.episodes + self.episode_rewards.append(self.total_reward[n]) + + model = self.model + mean_100ep_reward = round( + np.mean(self.episode_rewards[-101:]), 1) + if (n < self.nscripts): # scripted agents + self.episode_rewards_script.append( + self.total_reward[n]) + mean_100ep_reward_script = round( + np.mean(self.episode_rewards_script[-101:]), 1) + nsml.report( + reward_script=self.total_reward[n], + mean_reward_script=mean_100ep_reward_script, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + else: + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round( + np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report( + reward_a2c=self.total_reward[n], + mean_reward_a2c=mean_100ep_reward_a2c, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + 
scope=locals() + ) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + + if self.callback is not None: + self.callback(locals(), globals()) + self.total_reward[n] = 0 + self.group_list[n] = [] + + self.update_obs(obs) + mb_td_targets.append(rewards) + mb_dones.append(self.dones) + # batch of steps to batch of rollouts + mb_obs = np.asarray( + mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape) - mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray( - mb_base_actions, dtype=np.int32).swapaxes(1, 0) - - mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) - mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) - - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, - self.dones).tolist() - #discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate( - zip(mb_td_targets, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], - self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_td_targets[n] = rewards - mb_td_targets = mb_td_targets.flatten() - mb_base_actions = mb_base_actions.flatten() - mb_xy0 = mb_xy0.flatten() - mb_xy1 = mb_xy1.flatten() - - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, \ - mb_base_actions, mb_xy0, mb_xy1, mb_values + mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) + mb_base_actions = np.asarray( + mb_base_actions, dtype=np.int32).swapaxes(1, 0) + + mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) + mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) + + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + last_values = self.model.value(self.obs, self.states, + self.dones).tolist() + # discount/bootstrap off value fn + for n, (rewards, dones, value) in enumerate( + zip(mb_td_targets, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards + [value], dones + [0], + self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + mb_td_targets[n] = rewards + mb_td_targets = mb_td_targets.flatten() + mb_base_actions = mb_base_actions.flatten() + mb_xy0 = mb_xy0.flatten() + mb_xy1 = mb_xy1.flatten() + + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_td_targets, mb_masks, \ + mb_base_actions, mb_xy0, mb_xy1, mb_values def learn(policy, @@ -645,102 +643,102 @@ def learn(policy, save_interval=None, lrschedule='linear', callback=None): - tf.reset_default_graph() - set_global_seeds(seed) - - nenvs = nprocs - ob_space = (32, 32, 3) # env.observation_space - ac_space = (32, 32) - make_model = lambda: Model(policy, ob_space, ac_space, nenvs, - total_timesteps, - nprocs=nprocs, - nscripts=nscripts, - nsteps=nsteps, - nstack=nstack, - ent_coef=ent_coef, - vf_coef=vf_coef, - vf_fisher_coef=vf_fisher_coef, - lr=lr, - max_grad_norm=max_grad_norm, - kfac_clip=kfac_clip, - lrschedule=lrschedule) - - if save_interval and logger.get_dir(): - import 
cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) - model = make_model() - print("make_model complete!") - runner = Runner( - env, - model, - nsteps=nsteps, - nscripts=nscripts, - nstack=nstack, - gamma=gamma, - callback=callback) - nbatch = nenvs * nsteps - tstart = time.time() - # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) - for update in range(1, total_timesteps // nbatch + 1): - obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - - policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1, \ - = model.train(obs, states, td_targets, - masks, actions, - xy0, xy1, values) - - model.old_obs = obs - nseconds = time.time() - tstart - fps = int((update * nbatch) / nseconds) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, td_targets) - # nsml.report( - # nupdates=update, - # total_timesteps=update * nbatch, - # fps=fps, - # policy_entropy=float(policy_entropy), - # policy_loss=float(policy_loss), - - # policy_loss_xy0=float(policy_loss_xy0), - # policy_entropy_xy0=float(policy_entropy_xy0), - - # policy_loss_xy1=float(policy_loss_xy1), - # policy_entropy_xy1=float(policy_entropy_xy1), - - # value_loss=float(value_loss), - # explained_variance=float(ev), - - # batch_size=nbatch, - # step=update, - - # scope=locals() - # ) - # logger.record_tabular("nupdates", update) - # logger.record_tabular("total_timesteps", update * nbatch) - # logger.record_tabular("fps", fps) - # logger.record_tabular("policy_entropy", float(policy_entropy)) - # logger.record_tabular("policy_loss", float(policy_loss)) - - # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) - # logger.record_tabular("policy_entropy_xy0", - # float(policy_entropy_xy0)) - # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) - # logger.record_tabular("policy_entropy_xy1", - # float(policy_entropy_xy1)) - # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) - # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) - - # logger.record_tabular("value_loss", float(value_loss)) - # logger.record_tabular("explained_variance", float(ev)) - # logger.dump_tabular() - - if save_interval and (update % save_interval == 0 - or update == 1) and logger.get_dir(): - savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) - print('Saving to', savepath) - model.save(savepath) - - env.close() + tf.reset_default_graph() + set_global_seeds(seed) + + nenvs = nprocs + ob_space = (32, 32, 3) # env.observation_space + ac_space = (32, 32) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, + total_timesteps, + nprocs=nprocs, + nscripts=nscripts, + nsteps=nsteps, + nstack=nstack, + ent_coef=ent_coef, + vf_coef=vf_coef, + vf_fisher_coef=vf_fisher_coef, + lr=lr, + max_grad_norm=max_grad_norm, + kfac_clip=kfac_clip, + lrschedule=lrschedule) + + if save_interval and logger.get_dir(): + import cloudpickle + with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: + fh.write(cloudpickle.dumps(make_model)) + model = make_model() + print("make_model complete!") + runner = Runner( + env, + model, + nsteps=nsteps, + nscripts=nscripts, + nstack=nstack, + gamma=gamma, + callback=callback) + nbatch = nenvs * nsteps + tstart = time.time() + # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), 
start=True) + for update in range(1, total_timesteps // nbatch + 1): + obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() + + policy_loss, value_loss, policy_entropy, \ + policy_loss_xy0, policy_entropy_xy0, \ + policy_loss_xy1, policy_entropy_xy1, \ + = model.train(obs, states, td_targets, + masks, actions, + xy0, xy1, values) + + model.old_obs = obs + nseconds = time.time() - tstart + fps = int((update * nbatch) / nseconds) + if update % log_interval == 0 or update == 1: + ev = explained_variance(values, td_targets) + # nsml.report( + # nupdates=update, + # total_timesteps=update * nbatch, + # fps=fps, + # policy_entropy=float(policy_entropy), + # policy_loss=float(policy_loss), + + # policy_loss_xy0=float(policy_loss_xy0), + # policy_entropy_xy0=float(policy_entropy_xy0), + + # policy_loss_xy1=float(policy_loss_xy1), + # policy_entropy_xy1=float(policy_entropy_xy1), + + # value_loss=float(value_loss), + # explained_variance=float(ev), + + # batch_size=nbatch, + # step=update, + + # scope=locals() + # ) + # logger.record_tabular("nupdates", update) + # logger.record_tabular("total_timesteps", update * nbatch) + # logger.record_tabular("fps", fps) + # logger.record_tabular("policy_entropy", float(policy_entropy)) + # logger.record_tabular("policy_loss", float(policy_loss)) + + # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) + # logger.record_tabular("policy_entropy_xy0", + # float(policy_entropy_xy0)) + # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) + # logger.record_tabular("policy_entropy_xy1", + # float(policy_entropy_xy1)) + # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) + # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) + + # logger.record_tabular("value_loss", float(value_loss)) + # logger.record_tabular("explained_variance", float(ev)) + # logger.dump_tabular() + + if save_interval and (update % save_interval == 0 + or update == 1) and logger.get_dir(): + savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) + print('Saving to', savepath) + model.save(savepath) + + env.close() diff --git a/common/common.py b/common/common.py index 73dbd9f..1cda3c6 100644 --- a/common/common.py +++ b/common/common.py @@ -420,12 +420,12 @@ def check_group_list(env, obs): if group[0] == 48: army_count += group[1] if group[1] != 1: - print("group error group_id : %s count : %s" % (id, group[1])) + # print("group error group_id : %s count : %s" % (id, group[1])) error = True return error if army_count != env._obs[0].observation.player_common.army_count: error = True - print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) + # print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) return error diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 16fcb99..56da9a0 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -42,9 +42,9 @@ def worker(remote, map_name, nscripts, i): action1 = data[0][0] action2 = data[0][1] func = actions.FUNCTIONS[action1[0]] - print("agent(",i," ) action : ", action1, " func : ", func) + # print("agent(",i," ) action : ", action1, " func : ", func) func = actions.FUNCTIONS[action2[0]] - print("agent(",i," ) action : ", action2, " func : ", func) + # print("agent(",i," ) action : ", action2, " func : ", func) result = 
env.step(actions=[action1]) @@ -55,7 +55,7 @@ def worker(remote, map_name, nscripts, i): if len(action2[1]) == 2: x, y = action2[1][1] - print("x, y:", x, y) + # print("x, y:", x, y) if x == 0 and y == 0: move = False diff --git a/deepq_mineral_shards.py b/deepq_mineral_shards.py index d01cf01..ba231e7 100644 --- a/deepq_mineral_shards.py +++ b/deepq_mineral_shards.py @@ -327,7 +327,7 @@ def make_obs_ph(name): rew = obs[0].reward - done = obs[0].step_type == environment.StepType.LAST + done = obs[0].step_type == environment.StepType.LAST or obs[0].step_type == environment.StepType.FIRST # Store transition in the replay buffer. replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) @@ -349,7 +349,6 @@ def make_obs_ph(name): # Select all marines first env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) episode_rewards.append(0.0) - #episode_minerals.append(0.0) reset = True diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index 0f1aee7..7a40701 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -115,15 +115,15 @@ def learn(env, q_func, num_actions=3, lr=5e-4, - max_timesteps=100000, - buffer_size=50000, + max_timesteps=10000, + buffer_size=5000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, - checkpoint_freq=10000, - learning_starts=1000, + checkpoint_freq=1000, + learning_starts=100, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, @@ -352,8 +352,7 @@ def make_obs_ph(name): if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() - player_relative = obs[0].observation["feature_screen"][ - _PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = player_relative @@ -373,7 +372,7 @@ def make_obs_ph(name): else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + td_errors = train(np.expand_dims(obses_t, axis=1), actions, rewards, np.expand_dims(obses_tp1, axis=1), dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index 5eb3d79..279e2ba 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -92,13 +92,13 @@ def main(): q_func=model, num_actions=3, lr=1e-4, - max_timesteps=10000000, - buffer_size=100000, + max_timesteps=1000000, + buffer_size=10000, exploration_fraction=0.5, exploration_final_eps=0.01, train_freq=2, - learning_starts=100000, - target_network_update_freq=1000, + learning_starts=10000, + target_network_update_freq=100, gamma=0.99, prioritized_replay=True, callback=deepq_callback diff --git a/train_mineral_shards.py b/train_mineral_shards.py index 3661718..ddbd78d 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -33,7 +33,7 @@ flags.DEFINE_boolean("prioritized", True, "prioritized_replay") flags.DEFINE_boolean("dueling", True, "dueling") flags.DEFINE_float("lr", 0.0005, "Learning rate") -flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +flags.DEFINE_integer("num_agents", 6, "number of RL agents for A2C") flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") From 39a68926a7e72e18067409f12d8da98621a9d6ae Mon 
Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 16:17:30 -0500 Subject: [PATCH 04/11] Pulling out a2c functionality for readability and debugging. --- a2c/a2c.py | 48 +------------- train_mineral_shards.py | 10 +-- train_mineral_shards_a2c.py | 129 ++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 54 deletions(-) create mode 100644 train_mineral_shards_a2c.py diff --git a/a2c/a2c.py b/a2c/a2c.py index 5e2dbfa..68afcc4 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -683,57 +683,11 @@ def learn(policy, for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1, \ - = model.train(obs, states, td_targets, - masks, actions, - xy0, xy1, values) + policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs nseconds = time.time() - tstart fps = int((update * nbatch) / nseconds) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, td_targets) - # nsml.report( - # nupdates=update, - # total_timesteps=update * nbatch, - # fps=fps, - # policy_entropy=float(policy_entropy), - # policy_loss=float(policy_loss), - - # policy_loss_xy0=float(policy_loss_xy0), - # policy_entropy_xy0=float(policy_entropy_xy0), - - # policy_loss_xy1=float(policy_loss_xy1), - # policy_entropy_xy1=float(policy_entropy_xy1), - - # value_loss=float(value_loss), - # explained_variance=float(ev), - - # batch_size=nbatch, - # step=update, - - # scope=locals() - # ) - # logger.record_tabular("nupdates", update) - # logger.record_tabular("total_timesteps", update * nbatch) - # logger.record_tabular("fps", fps) - # logger.record_tabular("policy_entropy", float(policy_entropy)) - # logger.record_tabular("policy_loss", float(policy_loss)) - - # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) - # logger.record_tabular("policy_entropy_xy0", - # float(policy_entropy_xy0)) - # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) - # logger.record_tabular("policy_entropy_xy1", - # float(policy_entropy_xy1)) - # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) - # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) - - # logger.record_tabular("value_loss", float(value_loss)) - # logger.record_tabular("explained_variance", float(ev)) - # logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): diff --git a/train_mineral_shards.py b/train_mineral_shards.py index ddbd78d..005377e 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -183,8 +183,7 @@ def deepq_callback(locals, globals): global max_mean_reward, last_filename if 'done' in locals and locals['done'] == True: - if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10\ - and locals['mean_100ep_reward'] > max_mean_reward: + if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward'] > max_mean_reward: print("mean_100ep_reward : %s max_mean_reward : %s" % (locals['mean_100ep_reward'], max_mean_reward)) @@ -258,8 +257,7 @@ def deepq_4way_callback(locals, globals): def a2c_callback(locals, globals): global max_mean_reward, last_filename - if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10\ - and 
locals['mean_100ep_reward'] > max_mean_reward: + if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward'] > max_mean_reward: print("mean_100ep_reward : %s max_mean_reward : %s" % (locals['mean_100ep_reward'], max_mean_reward)) @@ -280,9 +278,7 @@ def a2c_callback(locals, globals): max_mean_reward = locals['mean_100ep_reward'] model = locals['model'] - filename = os.path.join( - PROJ_DIR, - 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward']) + filename = os.path.join(PROJ_DIR, 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward']) model.save(filename) print("save best mean_100ep_reward model to %s" % filename) last_filename = filename diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py new file mode 100644 index 0000000..ba3e90f --- /dev/null +++ b/train_mineral_shards_a2c.py @@ -0,0 +1,129 @@ +import sys +import os +import datetime +import random + +from absl import flags + +from pysc2.env import sc2_env +from pysc2.lib import actions +from baselines_legacy import cnn_to_mlp, BatchInput +from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat + +from common.vec_env.subproc_vec_env import SubprocVecEnv +from a2c.policies import CnnPolicy +from a2c import a2c +import deepq_mineral_4way +import deepq_mineral_shards + +_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id +_SELECT_ARMY = actions.FUNCTIONS.select_army.id +_SELECT_ALL = [0] +_NOT_QUEUED = [0] + +step_mul = 8 + +FLAGS = flags.FLAGS +flags.DEFINE_string("map", "CollectMineralShards", + "Name of a map to use to play.") +start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") +flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") +flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") +flags.DEFINE_integer("timesteps", 2000000, "Steps to train") +flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") +flags.DEFINE_boolean("prioritized", True, "prioritized_replay") +flags.DEFINE_boolean("dueling", True, "dueling") +flags.DEFINE_float("lr", 0.0005, "Learning rate") +flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") +flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") + +PROJ_DIR = os.path.dirname(os.path.abspath(__file__)) + +max_mean_reward = 0 +last_filename = "" +logdir = "" + + +def main(): + FLAGS(sys.argv) + + print("algorithm : %s" % FLAGS.algorithm) + print("timesteps : %s" % FLAGS.timesteps) + print("exploration_fraction : %s" % FLAGS.exploration_fraction) + print("prioritized : %s" % FLAGS.prioritized) + print("dueling : %s" % FLAGS.dueling) + print("num_agents : %s" % FLAGS.num_agents) + print("lr : %s" % FLAGS.lr) + + if FLAGS.lr == 0: + FLAGS.lr = random.uniform(0.00001, 0.001) + print("random lr : %s" % FLAGS.lr) + lr_round = round(FLAGS.lr, 8) + + logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) + + Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) + + num_timesteps = int(40e6) + num_timesteps //= 4 + + seed = 0 + + env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, + FLAGS.map) + + policy_fn = CnnPolicy + a2c.learn( + policy_fn, + env, + seed, + total_timesteps=num_timesteps, + nprocs=FLAGS.num_agents + FLAGS.num_scripts, + nscripts=FLAGS.num_scripts, + 
ent_coef=0.5, + nsteps=FLAGS.nsteps, + max_grad_norm=0.01, + callback=a2c_callback) + +from baselines import logger + +def a2c_callback(locals, globals): + global max_mean_reward, last_filename + + logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) + logger.record_tabular("num_episodes", locals['num_episodes']) + logger.record_tabular("environment_number", locals['env_num']) + logger.record_tabular("done", locals['done']) + + if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: + print("mean_100ep_reward_a2c : %s max_mean_reward : %s" % + (locals['mean_100ep_reward_a2c'], max_mean_reward)) + max_mean_reward = locals['mean_100ep_reward_a2c'] + logger.record_tabular("max_mean_reward", max_mean_reward) + + if not os.path.exists(os.path.join(PROJ_DIR, 'models/a2c/')): + try: + os.mkdir(os.path.join(PROJ_DIR, 'models/')) + except Exception as e: + print(str(e)) + try: + os.mkdir(os.path.join(PROJ_DIR, 'models/a2c/')) + except Exception as e: + print(str(e)) + + if last_filename != "": + os.remove(last_filename) + print("delete last model file : %s" % last_filename) + + model = locals['model'] + + filename = os.path.join(PROJ_DIR, 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward_a2c']) + model.save(filename) + print("save best mean_100ep_reward model to %s" % filename) + last_filename = filename + + logger.dump_tabular() + +if __name__ == '__main__': + main() From 3ef6fa33931c67331a75160f7f4828a415bb0842 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 20:58:07 -0500 Subject: [PATCH 05/11] Making things more readable --- a2c/a2c.py | 419 +++++++----------------------------- train_mineral_shards_a2c.py | 13 +- 2 files changed, 85 insertions(+), 347 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index 68afcc4..76954a4 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -5,18 +5,14 @@ import tensorflow as tf from baselines import logger -from baselines.common import set_global_seeds, explained_variance +from baselines.common import set_global_seeds from baselines.a2c.utils import discount_with_dones from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.utils import cat_entropy -# from a2c import kfac -from pysc2.env import environment from pysc2.lib import actions as sc2_actions -from common import common - import nsml _CONTROL_GROUP_RECALL = 0 @@ -30,70 +26,34 @@ def mse(pred, target): class Model(object): - def __init__(self, - policy, - ob_space, - ac_space, - nenvs, - total_timesteps, - nprocs=32, - nscripts=16, - nsteps=20, - nstack=4, - ent_coef=0.1, - vf_coef=0.5, - vf_fisher_coef=1.0, - lr=0.25, - max_grad_norm=0.001, - kfac_clip=0.001, - lrschedule='linear', - alpha=0.99, - epsilon=1e-5): - config = tf.ConfigProto( - allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) + def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, lr=0.25, max_grad_norm=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) - # nact = ac_space.n nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - - XY0 = tf.placeholder(tf.int32, [nbatch]) - XY1 = tf.placeholder(tf.int32, 
[nbatch]) + a = tf.placeholder(tf.int32, [nbatch]) - # ADV == TD_TARGET - values - ADV = tf.placeholder(tf.float32, [nbatch]) - TD_TARGET = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) + xy0 = tf.placeholder(tf.int32, [nbatch]) + xy1 = tf.placeholder(tf.int32, [nbatch]) - self.model = step_model = policy( - sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy( - sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + adv = tf.placeholder(tf.float32, [nbatch]) + td_target = tf.placeholder(tf.float32, [nbatch]) + pg_lr = tf.placeholder(tf.float32, []) - # Policy 1 : Base Action : train_model.pi label = A + self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - script_mask = tf.concat( - [ - tf.zeros([nscripts * nsteps, 1]), - tf.ones([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) + script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi, labels=A) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(a, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a) neglogpac *= tf.stop_gradient(pac_weight) - inv_A = 1.0 - tf.cast(A, tf.float32) - - xy0_mask = tf.cast(A, tf.float32) - xy1_mask = tf.cast(A, tf.float32) + xy0_mask = tf.cast(a, tf.float32) + xy1_mask = tf.cast(a, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) @@ -102,46 +62,32 @@ def __init__(self, condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - # One hot representation of chosen marine. - # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy0, labels=XY0) + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=xy0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - # 1D? 2D? 
- logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy1, labels=XY1) + logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=xy1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) - pg_loss = tf.reduce_mean(ADV * neglogpac) - pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + pg_loss = tf.reduce_mean(adv * neglogpac) + pg_loss_xy0 = tf.reduce_mean(adv * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(adv * logpac_xy1) vf_ = tf.squeeze(train_model.vf) - vf_r = tf.concat( - [ - tf.ones([nscripts * nsteps, 1]), - tf.zeros([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) * TD_TARGET + vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * td_target vf_masked = vf_ * script_mask + vf_r - # vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] - - vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + vf_loss = tf.reduce_mean(mse(vf_masked, td_target)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) @@ -154,48 +100,33 @@ def __init__(self, if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) - self.logits = logits = train_model.pi - - # xy0 + self.logits = train_model.pi - self.params_common = params_common = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy0') + params_common + self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - self.grads_check_xy0 = grads_xy0 = tf.gradients( - train_loss_xy0, params_xy0) + self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - # xy1 - - self.params_xy1 = params_xy1 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy1') + params_common - + self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - self.grads_check_xy1 = grads_xy1 = tf.gradients( - train_loss_xy1, params_xy1) + self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, 
epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) @@ -207,27 +138,19 @@ def train(obs, states, td_targets, masks, actions, xy0, xy1, values): td_map = { train_model.X: obs, - A: actions, - XY0: xy0, - XY1: xy1, - ADV: advs, - TD_TARGET: td_targets, - PG_LR: cur_lr + a: actions, + xy0: xy0, + xy1: xy1, + adv: advs, + td_target: td_targets, + pg_lr: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks - policy_loss, value_loss, policy_entropy, _, \ - policy_loss_xy0, policy_entropy_xy0, _, \ - policy_loss_xy1, policy_entropy_xy1, _ = sess.run( - [pg_loss, vf_loss, entropy, _train, - pg_loss_xy0, entropy_xy0, _train_xy0, - pg_loss_xy1, entropy_xy1, _train_xy1], - td_map) - return policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1 + policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) + return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) @@ -254,14 +177,7 @@ def load(load_path): class Runner(object): - def __init__(self, - env, - model, - nsteps, - nscripts, - nstack, - gamma, - callback=None): + def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): self.env = env self.model = model nh, nw, nc = (32, 32, 3) @@ -280,7 +196,7 @@ def __init__(self, self.army_counts = army_counts self.control_groups = control_groups self.selected = selected - self.update_obs(obs) # (2,13,32,32) + self.update_obs(obs) self.update_available(available_actions) self.gamma = gamma self.states = model.initial_state @@ -300,13 +216,12 @@ def __init__(self, self.group_id = [0 for _ in range(nenv)] - def update_obs(self, obs): # (self.nenv, 32, 32, 2) + def update_obs(self, obs): obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) self.obs = np.roll(self.obs, shift=-3, axis=3) new_map = np.zeros((self.nenv, 32, 32, 3)) new_map[:, :, :, -1] = obs[:, 0, :, :] for env_num in range(self.nenv): - # print("xy_per_marine: ", self.xy_per_marine) if "0" not in self.xy_per_marine[env_num]: self.xy_per_marine[env_num]["0"] = [0, 0] if "1" not in self.xy_per_marine[env_num]: @@ -317,39 +232,29 @@ def update_obs(self, obs): # (self.nenv, 32, 32, 2) new_map[env_num, marine0[0], marine0[1], -3] = 1 new_map[env_num, marine1[0], marine1[1], -2] = 1 self.obs[:, :, :, -3:] = new_map - # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) def update_available(self, _available_actions): - # print("update_available : ", _available_actions) self.available_actions = _available_actions - # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) for env_num, list in enumerate(_available_actions): - # print("env_num :", env_num, " list :", list) for action_num in list: - # print("action_num :", action_num) - if (action_num == 4): + if action_num == 4: self.base_act_mask[env_num][0] = 1 self.base_act_mask[env_num][1] = 1 elif action_num == 0: self.base_act_mask[env_num][2] = 1 - # elif(action_num == 331): - # self.base_act_mask[env_num][2] = 1 def valid_base_action(self, base_actions): for env_num, list in 
enumerate(self.available_actions): avail = [] for action_num in list: - if (action_num == 4): + if action_num == 4: avail.append(0) avail.append(1) elif action_num == 0: avail.append(2) - # elif(action_num == 331): - # avail.append(2) if base_actions[env_num] not in avail: - # print("env_num", env_num, " argmax is not valid. random pick ", avail) base_actions[env_num] = np.random.choice(avail) return base_actions @@ -357,99 +262,44 @@ def valid_base_action(self, base_actions): def trans_base_actions(self, base_actions): new_base_actions = np.copy(base_actions) for env_num, ba in enumerate(new_base_actions): - if (ba == 0): - new_base_actions[env_num] = 4 # move marine control group 0 - elif (ba == 1): - new_base_actions[env_num] = 4 # move marine control group 1 - elif (ba == 2): - new_base_actions[env_num] = 0 # move marine control group 1 - # elif(ba==2): - # new_base_actions[env_num] = 331 # move marine xy0 + if ba == 0: + new_base_actions[env_num] = 4 + elif ba == 1: + new_base_actions[env_num] = 4 + elif ba == 2: + new_base_actions[env_num] = 0 return new_base_actions def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): actions = [] for env_num, spec in enumerate(base_action_spec): - # print("spec", spec.args) - args = [] - # for arg_idx, arg in enumerate(spec.args): - # #print("arg", arg) - # #print("arg.id", arg.id) - # if(arg.id==0): # screen (32,32) x0, y0 - # args.append([int(x0[env_num]), int(y0[env_num])]) - # # elif(arg.id==1): # minimap (32,32) x1, y1 - # # args.append([int(x1[env_num]), int(y1[env_num])]) - # # elif(arg.id==2): # screen2 (32,32) x2, y2 - # # args.append([int(x2[env_num]), y2[env_num]]) - # elif(arg.id==3): # pi3 queued (2) - # args.append([int(0)]) - # elif(arg.id==4): # pi4 control_group_act (5) - # args.append([_CONTROL_GROUP_RECALL]) - # elif(arg.id==5): # pi5 control_group_id 10 - # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 - # # elif(arg.id==6): # pi6 select_point_act 4 - # # args.append([int(sub6[env_num])]) - # # elif(arg.id==7): # pi7 select_add 2 - # # args.append([int(sub7[env_num])]) - # # elif(arg.id==8): # pi8 select_unit_act 4 - # # args.append([int(sub8[env_num])]) - # # elif(arg.id==9): # pi9 select_unit_id 500 - # # args.append([int(sub9[env_num])]) - # # elif(arg.id==10): # pi10 select_worker 4 - # # args.append([int(sub10[env_num])]) - # # elif(arg.id==11): # pi11 build_queue_id 10 - # # args.append([int(sub11[env_num])]) - # # elif(arg.id==12): # pi12 unload_id 500 - # # args.append([int(sub12[env_num])]) - # else: - # raise NotImplementedError("cannot construct this arg", spec.args) two_action = [] if base_actions[env_num] == 0: - two_action.append( - sc2_actions.FunctionCall( - 4, - [[_CONTROL_GROUP_RECALL], [0]] - )) - - two_action.append( - sc2_actions.FunctionCall( - 331, - [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) elif base_actions[env_num] == 1: - two_action.append( - sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append( - sc2_actions.FunctionCall( - 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) elif base_actions[env_num] == 2: - two_action.append( - sc2_actions.FunctionCall(0, [])) 
- two_action.append( - sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) - # action = sc2_actions.FunctionCall(a, args) actions.append(two_action) return actions def run(self): - mb_obs, mb_td_targets, mb_base_actions, \ - mb_xy0, mb_xy1, \ - mb_values, mb_dones \ - = [], [], [], [], [], [], [] + mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] mb_states = self.states for n in range(self.nsteps): - # pi, pi2, x1, y1, x2, y2, v0 - pi1, pi_xy0, pi_xy1, values, states = self.model.step( - self.obs, self.states, self.dones) + pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - base_actions = np.argmax( - pi1 * self.base_act_mask + pi1_noise, axis=1) + base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) xy0 = np.argmax(pi_xy0, axis=1) x0 = (xy0 % 32).astype(int) @@ -459,59 +309,11 @@ def run(self): x1 = (xy1 % 32).astype(int) y1 = (xy1 / 32).astype(int) - # Scripted Agent Hacking - - for env_num in range(self.nenv): - if env_num >= self.nscripts: # only for scripted agents - continue - - ob = self.obs[env_num, :, :, :] - player_relative = ob[:, :, -1] - - self.group_list[env_num] = common.update_group_list2( - self.control_groups[env_num]) - - if len(self.action_queue[env_num]) == 0: - self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], \ - self.xy_per_marine[env_num] = \ - common.solve_tsp(player_relative, - self.selected[env_num][0], - self.group_list[env_num], - self.group_id[env_num], - self.dest_per_marine[env_num], - self.xy_per_marine[env_num]) - - base_actions[env_num] = 0 - x0[env_num] = 0 - y0[env_num] = 0 - x1[env_num] = 0 - y1[env_num] = 0 - - if len(self.action_queue[env_num]) > 0: - action = self.action_queue[env_num].pop(0) - base_actions[env_num] = action.get("base_action", 0) - - x0[env_num] = action.get("x0", 0) - y0[env_num] = action.get("y0", 0) - xy0[env_num] = y0[env_num] * 32 + x0[env_num] - - x1[env_num] = action.get("x1", 0) - y1[env_num] = action.get("y1", 0) - xy1[env_num] = y1[env_num] * 32 + x1[env_num] - base_actions = self.valid_base_action(base_actions) new_base_actions = self.trans_base_actions(base_actions) base_action_spec = self.env.action_spec(new_base_actions) - # print("base_actions:", base_actions) - actions = self.construct_action( - base_actions, - base_action_spec, - x0, - y0, - x1, - y1 - ) + actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) mb_obs.append(np.copy(self.obs)) mb_base_actions.append(base_actions) @@ -521,12 +323,7 @@ def run(self): mb_values.append(values) mb_dones.append(self.dones) - # print("final acitons : ", actions) - obs, rewards, dones, \ - available_actions, army_counts, \ - control_groups, selected, xy_per_marine \ - = self.env.step( - actions=actions) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) self.army_counts = army_counts self.control_groups = control_groups self.selected = selected @@ -546,36 +343,11 @@ def run(self): self.episode_rewards.append(self.total_reward[n]) model = self.model - mean_100ep_reward = round( - np.mean(self.episode_rewards[-101:]), 1) - if (n < self.nscripts): # scripted agents - self.episode_rewards_script.append( - self.total_reward[n]) - mean_100ep_reward_script = round( - 
np.mean(self.episode_rewards_script[-101:]), 1) - nsml.report( - reward_script=self.total_reward[n], - mean_reward_script=mean_100ep_reward_script, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - else: - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round( - np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report( - reward_a2c=self.total_reward[n], - mean_reward_a2c=mean_100ep_reward_a2c, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + mean_100ep_reward = round(np.mean(self.episode_rewards[-101:]), 1) + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round(np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report(reward_a2c=self.total_reward[n], mean_reward_a2c=mean_100ep_reward_a2c, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, episodes=self.episodes, step=self.episodes, scope=locals()) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) if self.callback is not None: self.callback(locals(), globals()) @@ -585,13 +357,9 @@ def run(self): self.update_obs(obs) mb_td_targets.append(rewards) mb_dones.append(self.dones) - # batch of steps to batch of rollouts - mb_obs = np.asarray( - mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( - self.batch_ob_shape) + mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray( - mb_base_actions, dtype=np.int32).swapaxes(1, 0) + mb_base_actions = np.asarray(mb_base_actions, dtype=np.int32).swapaxes(1, 0) mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) @@ -600,16 +368,13 @@ def run(self): mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, - self.dones).tolist() - # discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate( - zip(mb_td_targets, mb_dones, last_values)): + last_values = self.model.value(self.obs, self.states, self.dones).tolist() + + for n, (rewards, dones, value) in enumerate(zip(mb_td_targets, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], - self.gamma)[:-1] + rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_td_targets[n] = rewards @@ -620,8 +385,7 @@ def run(self): mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, \ - mb_base_actions, mb_xy0, mb_xy1, mb_values + return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values def learn(policy, @@ -629,17 +393,14 @@ def learn(policy, seed, total_timesteps=int(40e6), gamma=0.99, - log_interval=1, nprocs=24, nscripts=12, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, - vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.01, - kfac_clip=0.001, save_interval=None, lrschedule='linear', callback=None): @@ -649,19 +410,7 @@ def learn(policy, nenvs = nprocs ob_space = (32, 32, 3) # env.observation_space ac_space = (32, 32) - make_model = 
lambda: Model(policy, ob_space, ac_space, nenvs, - total_timesteps, - nprocs=nprocs, - nscripts=nscripts, - nsteps=nsteps, - nstack=nstack, - ent_coef=ent_coef, - vf_coef=vf_coef, - vf_fisher_coef=vf_fisher_coef, - lr=lr, - max_grad_norm=max_grad_norm, - kfac_clip=kfac_clip, - lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nscripts=nscripts, nsteps=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, lr=lr, max_grad_norm=max_grad_norm, lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle @@ -669,26 +418,14 @@ def learn(policy, fh.write(cloudpickle.dumps(make_model)) model = make_model() print("make_model complete!") - runner = Runner( - env, - model, - nsteps=nsteps, - nscripts=nscripts, - nstack=nstack, - gamma=gamma, - callback=callback) + runner = Runner(env, model, nsteps=nsteps, nscripts=nscripts, nstack=nstack, gamma=gamma, callback=callback) nbatch = nenvs * nsteps - tstart = time.time() - # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs - nseconds = time.time() - tstart - fps = int((update * nbatch) / nseconds) - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index ba3e90f..aa0f6a9 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -5,16 +5,12 @@ from absl import flags -from pysc2.env import sc2_env from pysc2.lib import actions -from baselines_legacy import cnn_to_mlp, BatchInput -from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat +from baselines.logger import Logger, TensorBoardOutputFormat from common.vec_env.subproc_vec_env import SubprocVecEnv from a2c.policies import CnnPolicy from a2c import a2c -import deepq_mineral_4way -import deepq_mineral_shards _MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id _SELECT_ARMY = actions.FUNCTIONS.select_army.id @@ -79,11 +75,16 @@ def main(): env, seed, total_timesteps=num_timesteps, + gamma=0.99, nprocs=FLAGS.num_agents + FLAGS.num_scripts, nscripts=FLAGS.num_scripts, - ent_coef=0.5, nsteps=FLAGS.nsteps, + ent_coef=0.5, + vf_coef=0.5, + lr=0.25, max_grad_norm=0.01, + save_interval=1000, + lrschedule='linear', callback=a2c_callback) from baselines import logger From b67106fb21aaa08843e2d8ccb65cf0281a16d1dc Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 16:29:12 -0500 Subject: [PATCH 06/11] Adding a couple changes before I delete and reclone, because having issues with Conda env. 
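The split below leaves a2c.learn as a thin driver around two new modules: a2c/model.py (TensorFlow graph construction plus the train/save/load closures) and a2c/runner.py (environment rollouts). As a rough guide to the shape this refactor produces, here is a minimal Python sketch of that driver loop, with stub Model and Runner classes and a simplified learn() signature standing in for the real pysc2-backed code; the stubs, their toy return values, and the trimmed argument list are illustrative assumptions, not part of this patch.

# Illustrative sketch only: stubbed Model/Runner showing the learn() driver
# structure after the refactor; the real classes live in a2c/model.py and
# a2c/runner.py and wrap TensorFlow and the SC2 environments.

class Model:
    def train(self, obs, states, td_targets, masks, actions, xy0, xy1, values):
        # The real Model runs one RMSProp update per policy head and returns
        # the per-head policy losses, value loss, and entropies.
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

class Runner:
    def __init__(self, env, model, nsteps):
        self.env, self.model, self.nsteps = env, model, nsteps

    def run(self):
        # The real Runner steps the vectorized SC2 envs for nsteps and returns
        # batched observations, td targets, masks, actions, coords, and values.
        return [], [], [], [], [], [], [], []

def learn(env, make_model, total_timesteps, nenvs, nsteps, save_interval=None):
    model = make_model()
    runner = Runner(env, model, nsteps)
    nbatch = nenvs * nsteps
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run()
        model.train(obs, states, td_targets, masks, actions, xy0, xy1, values)
        if save_interval and update % save_interval == 0:
            pass  # the real code checkpoints here via model.save(...)

if __name__ == "__main__":
    # Toy invocation so the sketch runs end to end without SC2 installed.
    learn(env=None, make_model=Model, total_timesteps=80, nenvs=4, nsteps=20)

The point of the sketch is the control flow, not the numbers: rollout collection and gradient updates live behind runner.run() and model.train(), so learn() itself stays a short loop, which is what the a2c/a2c.py hunk below reduces it to.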
--- a2c/__init__.py | 0 a2c/a2c.py | 414 +++--------------------------------- a2c/model.py | 178 ++++++++++++++++ a2c/runner.py | 253 ++++++++++++++++++++++ train_mineral_shards_a2c.py | 9 + 5 files changed, 472 insertions(+), 382 deletions(-) create mode 100644 a2c/__init__.py create mode 100644 a2c/model.py create mode 100644 a2c/runner.py diff --git a/a2c/__init__.py b/a2c/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/a2c/a2c.py b/a2c/a2c.py index 76954a4..61f1e6c 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -1,406 +1,32 @@ import os.path as osp import time -import joblib import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy - -from pysc2.lib import actions as sc2_actions - -import nsml - -_CONTROL_GROUP_RECALL = 0 -_NOT_QUEUED = 0 +from a2c.model import Model +from a2c.runner import Runner np.set_printoptions(threshold=np.inf) - -def mse(pred, target): - return tf.square(pred - target) / 2. - - -class Model(object): - def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, lr=0.25, max_grad_norm=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): - config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) - nsml.bind(sess=sess) - nbatch = nenvs * nsteps - a = tf.placeholder(tf.int32, [nbatch]) - - xy0 = tf.placeholder(tf.int32, [nbatch]) - xy1 = tf.placeholder(tf.int32, [nbatch]) - - adv = tf.placeholder(tf.float32, [nbatch]) - td_target = tf.placeholder(tf.float32, [nbatch]) - pg_lr = tf.placeholder(tf.float32, []) - - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - - script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) - - pi = train_model.pi - pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(a, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a) - neglogpac *= tf.stop_gradient(pac_weight) - - xy0_mask = tf.cast(a, tf.float32) - xy1_mask = tf.cast(a, tf.float32) - - condition0 = tf.equal(xy0_mask, 2) - xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) - xy0_mask = 1.0 - xy0_mask - - condition1 = tf.equal(xy1_mask, 2) - xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - - pi_xy0 = train_model.pi_xy0 - pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=xy0) - logpac_xy0 *= tf.stop_gradient(pac_weight) - logpac_xy0 *= tf.cast(xy0_mask, tf.float32) - - pi_xy1 = train_model.pi_xy1 - pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - - logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=xy1) - logpac_xy1 *= tf.stop_gradient(pac_weight) - logpac_xy1 *= 
tf.cast(xy1_mask, tf.float32) - - pg_loss = tf.reduce_mean(adv * neglogpac) - pg_loss_xy0 = tf.reduce_mean(adv * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(adv * logpac_xy1) - - vf_ = tf.squeeze(train_model.vf) - - vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * td_target - vf_masked = vf_ * script_mask + vf_r - - vf_loss = tf.reduce_mean(mse(vf_masked, td_target)) - entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) - entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) - entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) - entropy = entropy_a + entropy_xy0 + entropy_xy1 - - loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - - params = find_trainable_variables("model") - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train = trainer.apply_gradients(grads) - - self.logits = train_model.pi - - self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common - - train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) - if max_grad_norm is not None: - grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) - - grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - - self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common - train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) - if max_grad_norm is not None: - grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) - - grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) - - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) - - def train(obs, states, td_targets, masks, actions, xy0, xy1, values): - advs = td_targets - values - for step in range(len(obs)): - cur_lr = self.lr.value() - - td_map = { - train_model.X: obs, - a: actions, - xy0: xy0, - xy1: xy1, - adv: advs, - td_target: td_targets, - pg_lr: cur_lr - } - if states != []: - td_map[train_model.S] = states - td_map[train_model.M] = masks - - policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) - return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load - self.train_model = train_model - self.step_model 
= step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - print("global_variables_initializer start") - tf.global_variables_initializer().run(session=sess) - print("global_variables_initializer complete") - - -class Runner(object): - def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): - self.env = env - self.model = model - nh, nw, nc = (32, 32, 3) - self.nsteps = nsteps - self.nscripts = nscripts - self.nenv = nenv = env.num_envs - self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) - self.batch_coord_shape = (nenv * nsteps, 32) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - self.available_actions = None - self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() - self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - self.update_obs(obs) - self.update_available(available_actions) - self.gamma = gamma - self.states = model.initial_state - self.dones = [False for _ in range(nenv)] - self.total_reward = [0.0 for _ in range(nenv)] - self.episode_rewards = [] - self.episode_rewards_script = [] - self.episode_rewards_a2c = [] - self.episodes = 0 - self.steps = 0 - self.callback = callback - - self.action_queue = [[] for _ in range(nenv)] - self.group_list = [[] for _ in range(nenv)] - self.agent_state = ["IDLE" for _ in range(nenv)] - self.dest_per_marine = [{} for _ in range(nenv)] - - self.group_id = [0 for _ in range(nenv)] - - def update_obs(self, obs): - obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) - self.obs = np.roll(self.obs, shift=-3, axis=3) - new_map = np.zeros((self.nenv, 32, 32, 3)) - new_map[:, :, :, -1] = obs[:, 0, :, :] - for env_num in range(self.nenv): - if "0" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["0"] = [0, 0] - if "1" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["1"] = [0, 0] - - marine0 = self.xy_per_marine[env_num]["0"] - marine1 = self.xy_per_marine[env_num]["1"] - new_map[env_num, marine0[0], marine0[1], -3] = 1 - new_map[env_num, marine1[0], marine1[1], -2] = 1 - self.obs[:, :, :, -3:] = new_map - - def update_available(self, _available_actions): - self.available_actions = _available_actions - self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) - for env_num, list in enumerate(_available_actions): - for action_num in list: - if action_num == 4: - self.base_act_mask[env_num][0] = 1 - self.base_act_mask[env_num][1] = 1 - elif action_num == 0: - self.base_act_mask[env_num][2] = 1 - - def valid_base_action(self, base_actions): - for env_num, list in enumerate(self.available_actions): - avail = [] - for action_num in list: - if action_num == 4: - avail.append(0) - avail.append(1) - elif action_num == 0: - avail.append(2) - - if base_actions[env_num] not in avail: - base_actions[env_num] = np.random.choice(avail) - - return base_actions - - def trans_base_actions(self, base_actions): - new_base_actions = np.copy(base_actions) - for env_num, ba in enumerate(new_base_actions): - if ba == 0: - new_base_actions[env_num] = 4 - elif ba == 1: - new_base_actions[env_num] = 4 - elif ba == 2: - new_base_actions[env_num] = 0 - - return new_base_actions - - def 
construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): - actions = [] - for env_num, spec in enumerate(base_action_spec): - two_action = [] - if base_actions[env_num] == 0: - two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) - two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) - - elif base_actions[env_num] == 1: - two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) - elif base_actions[env_num] == 2: - two_action.append(sc2_actions.FunctionCall(0, [])) - two_action.append(sc2_actions.FunctionCall(0, [])) - - actions.append(two_action) - - return actions - - def run(self): - mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] - - mb_states = self.states - for n in range(self.nsteps): - pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) - - pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - - base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) - xy0 = np.argmax(pi_xy0, axis=1) - - x0 = (xy0 % 32).astype(int) - y0 = (xy0 / 32).astype(int) - - xy1 = np.argmax(pi_xy1, axis=1) - x1 = (xy1 % 32).astype(int) - y1 = (xy1 / 32).astype(int) - - base_actions = self.valid_base_action(base_actions) - new_base_actions = self.trans_base_actions(base_actions) - - base_action_spec = self.env.action_spec(new_base_actions) - actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) - - mb_obs.append(np.copy(self.obs)) - mb_base_actions.append(base_actions) - - mb_xy0.append(xy0) - mb_xy1.append(xy1) - mb_values.append(values) - mb_dones.append(self.dones) - - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.update_available(available_actions) - - self.states = states - self.dones = dones - mean_100ep_reward_a2c = 0 - for n, done in enumerate(dones): - self.total_reward[n] += float(rewards[n]) - if done: - self.obs[n] = self.obs[n] * 0 - self.episodes += 1 - num_episodes = self.episodes - self.episode_rewards.append(self.total_reward[n]) - - model = self.model - mean_100ep_reward = round(np.mean(self.episode_rewards[-101:]), 1) - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round(np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report(reward_a2c=self.total_reward[n], mean_reward_a2c=mean_100ep_reward_a2c, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, episodes=self.episodes, step=self.episodes, scope=locals()) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) - - if self.callback is not None: - self.callback(locals(), globals()) - self.total_reward[n] = 0 - self.group_list[n] = [] - - self.update_obs(obs) - mb_td_targets.append(rewards) - mb_dones.append(self.dones) - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) - mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray(mb_base_actions, dtype=np.int32).swapaxes(1, 0) - - mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) - mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) 
- - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, self.dones).tolist() - - for n, (rewards, dones, value) in enumerate(zip(mb_td_targets, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_td_targets[n] = rewards - mb_td_targets = mb_td_targets.flatten() - mb_base_actions = mb_base_actions.flatten() - mb_xy0 = mb_xy0.flatten() - mb_xy1 = mb_xy1.flatten() - - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values - - def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, + log_interval=1, nprocs=24, nscripts=12, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, + vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.01, + kfac_clip=0.001, save_interval=None, lrschedule='linear', callback=None): @@ -410,7 +36,19 @@ def learn(policy, nenvs = nprocs ob_space = (32, 32, 3) # env.observation_space ac_space = (32, 32) - make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nscripts=nscripts, nsteps=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, lr=lr, max_grad_norm=max_grad_norm, lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, + total_timesteps, + nprocs=nprocs, + nscripts=nscripts, + nsteps=nsteps, + nstack=nstack, + ent_coef=ent_coef, + vf_coef=vf_coef, + vf_fisher_coef=vf_fisher_coef, + lr=lr, + max_grad_norm=max_grad_norm, + kfac_clip=kfac_clip, + lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle @@ -418,14 +56,26 @@ def learn(policy, fh.write(cloudpickle.dumps(make_model)) model = make_model() print("make_model complete!") - runner = Runner(env, model, nsteps=nsteps, nscripts=nscripts, nstack=nstack, gamma=gamma, callback=callback) + runner = Runner( + env, + model, + nsteps=nsteps, + nscripts=nscripts, + nstack=nstack, + gamma=gamma, + callback=callback) nbatch = nenvs * nsteps + tstart = time.time() + # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) + model.policy_loss, model.value_loss, model.policy_entropy, model.policy_loss_xy0, model.policy_entropy_xy0, model.policy_loss_xy1, model.policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs + nseconds = time.time() - tstart + fps = int((update * nbatch) / nseconds) + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) diff --git a/a2c/model.py b/a2c/model.py new file mode 100644 index 0000000..76aeb9d --- /dev/null +++ b/a2c/model.py @@ -0,0 +1,178 @@ +import joblib +import tensorflow as tf +from baselines.a2c.utils import cat_entropy, find_trainable_variables, Scheduler + +import nsml + + +class 
Model(object): + def __init__(self, + policy, + ob_space, + ac_space, + nenvs, + total_timesteps, + nprocs=32, + nscripts=16, + nsteps=20, + nstack=4, + ent_coef=0.1, + vf_coef=0.5, + vf_fisher_coef=1.0, + lr=0.25, + max_grad_norm=0.001, + kfac_clip=0.001, + lrschedule='linear', + alpha=0.99, + epsilon=1e-5): + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) + config.gpu_options.allow_growth = True + self.sess = sess = tf.Session(config=config) + nsml.bind(sess=sess) + nbatch = nenvs * nsteps + A = tf.placeholder(tf.int32, [nbatch]) + + XY0 = tf.placeholder(tf.int32, [nbatch]) + XY1 = tf.placeholder(tf.int32, [nbatch]) + + ADV = tf.placeholder(tf.float32, [nbatch]) + TD_TARGET = tf.placeholder(tf.float32, [nbatch]) + PG_LR = tf.placeholder(tf.float32, []) + + self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + + script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) + + pi = train_model.pi + pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A) + neglogpac *= tf.stop_gradient(pac_weight) + + xy0_mask = tf.cast(A, tf.float32) + xy1_mask = tf.cast(A, tf.float32) + + condition0 = tf.equal(xy0_mask, 2) + xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) + xy0_mask = 1.0 - xy0_mask + + condition1 = tf.equal(xy1_mask, 2) + xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) + + pi_xy0 = train_model.pi_xy0 + pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=XY0) + logpac_xy0 *= tf.stop_gradient(pac_weight) + logpac_xy0 *= tf.cast(xy0_mask, tf.float32) + + pi_xy1 = train_model.pi_xy1 + pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=XY1) + logpac_xy1 *= tf.stop_gradient(pac_weight) + logpac_xy1 *= tf.cast(xy1_mask, tf.float32) + + pg_loss = tf.reduce_mean(ADV * neglogpac) + pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + + vf_ = tf.squeeze(train_model.vf) + + vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * TD_TARGET + vf_masked = vf_ * script_mask + vf_r + + vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) + entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) + entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) + entropy = entropy_a + entropy_xy0 + entropy_xy1 + + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + + params = find_trainable_variables("model") + grads = tf.gradients(loss, params) + if max_grad_norm is not None: + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) + grads = list(zip(grads, params)) + trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train = trainer.apply_gradients(grads) + + self.logits = train_model.pi + + 
self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common + + train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) + if max_grad_norm is not None: + grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) + + grads_xy0 = list(zip(grads_xy0, params_xy0)) + trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) + + self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common + train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) + if max_grad_norm is not None: + grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) + + grads_xy1 = list(zip(grads_xy1, params_xy1)) + trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) + + self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + + def train(obs, states, td_targets, masks, actions, xy0, xy1, values): + advs = td_targets - values + for step in range(len(obs)): + cur_lr = self.lr.value() + + td_map = { + train_model.X: obs, + A: actions, + XY0: xy0, + XY1: xy1, + ADV: advs, + TD_TARGET: td_targets, + PG_LR: cur_lr + } + if states != []: + td_map[train_model.S] = states + td_map[train_model.M] = masks + + policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) + return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 + + def save(save_path): + ps = sess.run(params) + joblib.dump(ps, save_path) + + def load(load_path): + loaded_params = joblib.load(load_path) + restores = [] + for p, loaded_p in zip(params, loaded_params): + restores.append(p.assign(loaded_p)) + sess.run(restores) + + self.train = train + self.save = save + self.load = load + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.value = step_model.value + self.initial_state = step_model.initial_state + print("global_variables_initializer start") + tf.global_variables_initializer().run(session=sess) + print("global_variables_initializer complete") + + +def mse(pred, target): + return tf.square(pred - target) / 2. 
\ No newline at end of file diff --git a/a2c/runner.py b/a2c/runner.py new file mode 100644 index 0000000..723f781 --- /dev/null +++ b/a2c/runner.py @@ -0,0 +1,253 @@ +import numpy as np +from baselines.a2c.utils import discount_with_dones +from pysc2.lib import actions as sc2_actions + +import nsml + + +class Runner(object): + def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): + self.env = env + self.model = model + nh, nw, nc = (32, 32, 3) + self.nsteps = nsteps + self.nscripts = nscripts + self.nenv = nenv = env.num_envs + self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) + self.batch_coord_shape = (nenv * nsteps, 32) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + self.available_actions = None + self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() + self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + self.update_obs(obs) + self.update_available(available_actions) + self.gamma = gamma + self.states = model.initial_state + self.dones = [False for _ in range(nenv)] + self.total_reward = [0.0 for _ in range(nenv)] + self.episode_rewards = [] + self.episode_rewards_script = [] + self.episode_rewards_a2c = [] + self.episodes = 0 + self.steps = 0 + self.callback = callback + + self.action_queue = [[] for _ in range(nenv)] + self.group_list = [[] for _ in range(nenv)] + self.agent_state = ["IDLE" for _ in range(nenv)] + self.dest_per_marine = [{} for _ in range(nenv)] + + self.group_id = [0 for _ in range(nenv)] + + def update_obs(self, obs): + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + self.obs = np.roll(self.obs, shift=-3, axis=3) + new_map = np.zeros((self.nenv, 32, 32, 3)) + new_map[:, :, :, -1] = obs[:, 0, :, :] + for env_num in range(self.nenv): + if "0" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["0"] = [0, 0] + if "1" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["1"] = [0, 0] + + marine0 = self.xy_per_marine[env_num]["0"] + marine1 = self.xy_per_marine[env_num]["1"] + new_map[env_num, marine0[0], marine0[1], -3] = 1 + new_map[env_num, marine1[0], marine1[1], -2] = 1 + self.obs[:, :, :, -3:] = new_map + + def update_available(self, _available_actions): + self.available_actions = _available_actions + self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) + for env_num, list in enumerate(_available_actions): + for action_num in list: + if action_num == 4: + self.base_act_mask[env_num][0] = 1 + self.base_act_mask[env_num][1] = 1 + elif action_num == 0: + self.base_act_mask[env_num][2] = 1 + + def valid_base_action(self, base_actions): + for env_num, list in enumerate(self.available_actions): + avail = [] + for action_num in list: + if action_num == 4: + avail.append(0) + avail.append(1) + elif action_num == 0: + avail.append(2) + + if base_actions[env_num] not in avail: + base_actions[env_num] = np.random.choice(avail) + + return base_actions + + def trans_base_actions(self, base_actions): + new_base_actions = np.copy(base_actions) + for env_num, ba in enumerate(new_base_actions): + if ba == 0: + new_base_actions[env_num] = 4 # move marine control group 0 + elif ba == 1: + new_base_actions[env_num] = 4 # move marine control group 1 + 
elif ba == 2: + new_base_actions[env_num] = 0 # move marine control group 1 + + return new_base_actions + + def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): + actions = [] + for env_num, spec in enumerate(base_action_spec): + two_action = [] + if base_actions[env_num] == 0: + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + + elif base_actions[env_num] == 1: + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + elif base_actions[env_num] == 2: + two_action.append(sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) + + actions.append(two_action) + + return actions + + def run(self): + mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] + + mb_states = self.states + for n in range(self.nsteps): + pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) + + pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 + + base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) + xy0 = np.argmax(pi_xy0, axis=1) + + x0 = (xy0 % 32).astype(int) + y0 = (xy0 / 32).astype(int) + + xy1 = np.argmax(pi_xy1, axis=1) + x1 = (xy1 % 32).astype(int) + y1 = (xy1 / 32).astype(int) + + base_actions = self.valid_base_action(base_actions) + new_base_actions = self.trans_base_actions(base_actions) + + base_action_spec = self.env.action_spec(new_base_actions) + actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) + + mb_obs.append(np.copy(self.obs)) + mb_base_actions.append(base_actions) + + mb_xy0.append(xy0) + mb_xy1.append(xy1) + mb_values.append(values) + mb_dones.append(self.dones) + + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.update_available(available_actions) + + self.states = states + self.dones = dones + mean_100ep_reward_a2c = 0 + for n, done in enumerate(dones): + self.total_reward[n] += float(rewards[n]) + if done: + self.obs[n] = self.obs[n] * 0 + self.episodes += 1 + num_episodes = self.episodes + self.episode_rewards.append(self.total_reward[n]) + + model = self.model + mean_100ep_reward = round( + np.mean(self.episode_rewards[-101:]), 1) + if n < self.nscripts: # scripted agents + self.episode_rewards_script.append( + self.total_reward[n]) + mean_100ep_reward_script = round( + np.mean(self.episode_rewards_script[-101:]), 1) + nsml.report( + reward_script=self.total_reward[n], + mean_reward_script=mean_100ep_reward_script, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + else: + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round( + np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report( + reward_a2c=self.total_reward[n], + mean_reward_a2c=mean_100ep_reward_a2c, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + + if 
self.callback is not None: + self.callback(locals(), globals()) + self.total_reward[n] = 0 + self.group_list[n] = [] + + self.update_obs(obs) + mb_td_targets.append(rewards) + mb_dones.append(self.dones) + # batch of steps to batch of rollouts + mb_obs = np.asarray( + mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( + self.batch_ob_shape) + mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) + mb_base_actions = np.asarray( + mb_base_actions, dtype=np.int32).swapaxes(1, 0) + + mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) + mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) + + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + last_values = self.model.value(self.obs, self.states, + self.dones).tolist() + # discount/bootstrap off value fn + for n, (rewards, dones, value) in enumerate( + zip(mb_td_targets, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards + [value], dones + [0], + self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + mb_td_targets[n] = rewards + mb_td_targets = mb_td_targets.flatten() + mb_base_actions = mb_base_actions.flatten() + mb_xy0 = mb_xy0.flatten() + mb_xy1 = mb_xy1.flatten() + + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values + + +_CONTROL_GROUP_RECALL = 0 +_NOT_QUEUED = 0 \ No newline at end of file diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index aa0f6a9..6517580 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -95,6 +95,15 @@ def a2c_callback(locals, globals): logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) logger.record_tabular("num_episodes", locals['num_episodes']) logger.record_tabular("environment_number", locals['env_num']) + logger.record_tabular("policy_loss", locals['model'].policy_loss) + logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) + logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) + logger.record_tabular("policy_entropy", locals['model'].policy_entropy) + logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) + logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) + logger.record_tabular("learning_rate_N", locals['model'].lr.n) + logger.record_tabular("learning_rate_V", locals['model'].lr.v) + logger.record_tabular("value_loss", locals['model'].value_loss) logger.record_tabular("done", locals['done']) if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: From f08fa57c1e0824782568e8d1b2c917ed41505ed4 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 16:30:57 -0500 Subject: [PATCH 07/11] Adding a couple changes before I delete and reclone, because having issues with Conda env. 
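A note on the n-step targets built at the end of Runner.run() in a2c/runner.py above: when a rollout does not end on a terminal step, the value head's estimate for the last observation is appended to the rewards, discounted back, and then dropped again. The sketch below is how I read baselines.a2c.utils.discount_with_dones (this standalone copy is an assumption for illustration, not the canonical implementation); the example reproduces the "rewards + [value] ... [:-1]" bootstrapping path. The xy heads decode the same way as in run(): x = xy % 32 and y = xy // 32 on the 32x32 screen.

def discount_with_dones(rewards, dones, gamma):
    # assumed equivalent to baselines.a2c.utils.discount_with_dones
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)  # a done step resets the running return
        discounted.append(running)
    return discounted[::-1]

# Three-step rollout that does not hit a terminal state: append the critic's value
# for the last obs (10.0 here), discount it back, then drop it -- the same
# "rewards + [value] ... [:-1]" path used in Runner.run().
rewards, dones, gamma, last_value = [1.0, 0.0, 2.0], [0, 0, 0], 0.99, 10.0
targets = discount_with_dones(rewards + [last_value], dones + [0], gamma)[:-1]
print(targets)  # [12.66..., 11.78..., 11.9] -- bootstrapped n-step returns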
--- requirements.txt | 11 +++-- train_mineral_shards_a2c.py | 86 ++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/requirements.txt b/requirements.txt index 029d0f6..5a23461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,12 @@ git+https://github.com/deepmind/pysc2 git+https://github.com/openai/baselines -numpy -tensorflow +numpy~=1.19.5 +tensorflow~=1.14.0 absl-py cloudpickle -dill \ No newline at end of file +dill~=0.3.3 +tensorflow-gpu~=1.14.0 +joblib~=1.0.1 +six~=1.15.0 +future~=0.18.2 +s2clientprotocol~=5.0.6.83830.0 \ No newline at end of file diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index 6517580..ed0c835 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -3,36 +3,36 @@ import datetime import random -from absl import flags +import absl -from pysc2.lib import actions +import pysc2.lib from baselines.logger import Logger, TensorBoardOutputFormat -from common.vec_env.subproc_vec_env import SubprocVecEnv +import common.vec_env.subproc_vec_env +import a2c from a2c.policies import CnnPolicy -from a2c import a2c -_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id -_SELECT_ARMY = actions.FUNCTIONS.select_army.id +_MOVE_SCREEN = pysc2.lib.actions.FUNCTIONS.Move_screen.id +_SELECT_ARMY = pysc2.lib.actions.FUNCTIONS.select_army.id _SELECT_ALL = [0] _NOT_QUEUED = [0] step_mul = 8 -FLAGS = flags.FLAGS -flags.DEFINE_string("map", "CollectMineralShards", +FLAGS = absl.flags.FLAGS +absl.flags.DEFINE_string("map", "CollectMineralShards", "Name of a map to use to play.") start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") -flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") -flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") -flags.DEFINE_integer("timesteps", 2000000, "Steps to train") -flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") -flags.DEFINE_boolean("prioritized", True, "prioritized_replay") -flags.DEFINE_boolean("dueling", True, "dueling") -flags.DEFINE_float("lr", 0.0005, "Learning rate") -flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") -flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") -flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") +absl.flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") +absl.flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") +absl.flags.DEFINE_integer("timesteps", 2000000, "Steps to train") +absl.flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") +absl.flags.DEFINE_boolean("prioritized", True, "prioritized_replay") +absl.flags.DEFINE_boolean("dueling", True, "dueling") +absl.flags.DEFINE_float("lr", 0.0005, "Learning rate") +absl.flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +absl.flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") +absl.flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") PROJ_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -57,7 +57,9 @@ def main(): print("random lr : %s" % FLAGS.lr) lr_round = round(FLAGS.lr, 8) - logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) + logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( + FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, 
lr_round, + start_time) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) @@ -66,11 +68,11 @@ def main(): seed = 0 - env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, - FLAGS.map) + env = common.vec_env.subproc_vec_env.SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, + FLAGS.map) policy_fn = CnnPolicy - a2c.learn( + a2c.a2c.learn( policy_fn, env, seed, @@ -87,30 +89,33 @@ def main(): lrschedule='linear', callback=a2c_callback) -from baselines import logger + +import baselines + def a2c_callback(locals, globals): global max_mean_reward, last_filename - logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) - logger.record_tabular("num_episodes", locals['num_episodes']) - logger.record_tabular("environment_number", locals['env_num']) - logger.record_tabular("policy_loss", locals['model'].policy_loss) - logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) - logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) - logger.record_tabular("policy_entropy", locals['model'].policy_entropy) - logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) - logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) - logger.record_tabular("learning_rate_N", locals['model'].lr.n) - logger.record_tabular("learning_rate_V", locals['model'].lr.v) - logger.record_tabular("value_loss", locals['model'].value_loss) - logger.record_tabular("done", locals['done']) - - if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: + baselines.logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) + baselines.logger.record_tabular("num_episodes", locals['num_episodes']) + baselines.logger.record_tabular("environment_number", locals['env_num']) + baselines.logger.record_tabular("policy_loss", locals['model'].policy_loss) + baselines.logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) + baselines.logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) + baselines.logger.record_tabular("policy_entropy", locals['model'].policy_entropy) + baselines.logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) + baselines.logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) + baselines.logger.record_tabular("learning_rate_N", locals['model'].lr.n) + baselines.logger.record_tabular("learning_rate_V", locals['model'].lr.v) + baselines.logger.record_tabular("value_loss", locals['model'].value_loss) + baselines.logger.record_tabular("done", locals['done']) + + if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals[ + 'mean_100ep_reward_a2c'] > max_mean_reward: print("mean_100ep_reward_a2c : %s max_mean_reward : %s" % (locals['mean_100ep_reward_a2c'], max_mean_reward)) max_mean_reward = locals['mean_100ep_reward_a2c'] - logger.record_tabular("max_mean_reward", max_mean_reward) + baselines.logger.record_tabular("max_mean_reward", max_mean_reward) if not os.path.exists(os.path.join(PROJ_DIR, 'models/a2c/')): try: @@ -133,7 +138,8 @@ def a2c_callback(locals, globals): print("save best mean_100ep_reward model to %s" % filename) last_filename = filename - logger.dump_tabular() + baselines.logger.dump_tabular() + if __name__ == '__main__': main() From 58cc917d522647fe825c74b59e8970c79e771c56 Mon Sep 17 00:00:00 2001 From: 
rwill128 Date: Tue, 2 Mar 2021 17:16:41 -0500 Subject: [PATCH 08/11] Making corrections to requirements.txt --- requirements.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a23461..ec3d62e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,7 @@ -git+https://github.com/deepmind/pysc2 -git+https://github.com/openai/baselines -numpy~=1.19.5 -tensorflow~=1.14.0 absl-py -cloudpickle +cloudpickle==1.2.0 dill~=0.3.3 -tensorflow-gpu~=1.14.0 +tensorflow-gpu==1.14.0 joblib~=1.0.1 six~=1.15.0 future~=0.18.2 From 9228ef3608b960ce810d1ef9ecd348c95d1a7005 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 17:17:33 -0500 Subject: [PATCH 09/11] Making corrections to requirements.txt --- requirements.txt | 54 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec3d62e..f801d25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,50 @@ -absl-py +absl-py==0.11.0 +astor==0.8.1 +baselines==0.1.6 +cached-property==1.5.2 +certifi==2020.12.5 +chardet==4.0.0 +click==8.0.0a1 cloudpickle==1.2.0 -dill~=0.3.3 +deepdiff==5.2.3 +dill==0.3.3 +enum34==1.1.10 +future==0.18.2 +gast==0.4.0 +google-pasta==0.2.0 +grpcio==1.36.0 +gym==0.15.7 +h5py==3.1.0 +idna==2.10 +importlib-metadata==3.7.0 +joblib==1.0.1 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mock==4.0.3 +mpyq==0.2.5 +numpy==1.20.1 +opencv-python==4.5.1.48 +ordered-set==4.0.2 +portpicker==1.3.1 +protobuf==3.15.3 +pygame==2.0.1 +pysc2==2.0 +requests==2.25.1 +s2clientprotocol==5.0.6.83830.0 +s2protocol==5.0.6.83830.0 +scipy==1.6.1 +six==1.15.0 +sk-video==1.1.10 +tensorboard==1.14.0 +tensorflow-estimator==1.14.0 tensorflow-gpu==1.14.0 -joblib~=1.0.1 -six~=1.15.0 -future~=0.18.2 -s2clientprotocol~=5.0.6.83830.0 \ No newline at end of file +termcolor==1.1.0 +tqdm==4.58.0 +typing-extensions==3.7.4.3 +urllib3==1.26.3 +websocket-client==0.57.0 +Werkzeug==1.0.1 +whichcraft==0.6.1 +wrapt==1.12.1 +zipp==3.4.0 From a25a1d682905c1efebf8113ea61272e543931d94 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 19:20:01 -0500 Subject: [PATCH 10/11] Finally got things working again, and this time with a mostly legit requirements.txt file. 
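The diff below parses the command line with FLAGS(sys.argv) immediately after grabbing absl.flags.FLAGS and before any of the DEFINE_* calls, presumably to avoid the "flags not parsed" error when the script is run directly instead of through absl.app.run. As far as I can tell, that also means the flags defined afterwards only ever report their default values, and anything passed on the command line for them is rejected as an unknown flag at parse time. For comparison, a minimal sketch of the usual ordering (define first, parse once); the flag names are illustrative only:

import sys

from absl import flags

FLAGS = flags.FLAGS

# Define everything first...
flags.DEFINE_string("map", "CollectMineralShards", "Name of a map to use to play.")
flags.DEFINE_float("lr", 0.0005, "Learning rate")


def main():
    # ...then parse once, so values passed on the command line are honored.
    FLAGS(sys.argv)
    print(FLAGS.map, FLAGS.lr)


if __name__ == "__main__":
    main()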
--- train_mineral_shards_a2c.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index ed0c835..647ea01 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -4,12 +4,13 @@ import random import absl +import baselines import pysc2.lib from baselines.logger import Logger, TensorBoardOutputFormat import common.vec_env.subproc_vec_env -import a2c +from a2c.a2c import learn from a2c.policies import CnnPolicy _MOVE_SCREEN = pysc2.lib.actions.FUNCTIONS.Move_screen.id @@ -20,8 +21,13 @@ step_mul = 8 FLAGS = absl.flags.FLAGS + +import sys + +FLAGS(sys.argv) + absl.flags.DEFINE_string("map", "CollectMineralShards", - "Name of a map to use to play.") + "Name of a map to use to play.") start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") absl.flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") absl.flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") @@ -58,8 +64,9 @@ def main(): lr_round = round(FLAGS.lr, 8) logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( - FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, - start_time) + FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, + lr_round, + start_time) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) @@ -72,7 +79,7 @@ def main(): FLAGS.map) policy_fn = CnnPolicy - a2c.a2c.learn( + learn( policy_fn, env, seed, @@ -90,9 +97,6 @@ def main(): callback=a2c_callback) -import baselines - - def a2c_callback(locals, globals): global max_mean_reward, last_filename From 9035a5d5155e2f8110483b786dbc8684ab19aa32 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 19:40:24 -0500 Subject: [PATCH 11/11] Making corrections to requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f801d25..4450412 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,7 +38,7 @@ six==1.15.0 sk-video==1.1.10 tensorboard==1.14.0 tensorflow-estimator==1.14.0 -tensorflow-gpu==1.14.0 +tensorflow==1.14.0 termcolor==1.1.0 tqdm==4.58.0 typing-extensions==3.7.4.3
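With the pins settled, a quick sanity check right after pip install -r requirements.txt can save another round of conda debugging. check_env.py below is a hypothetical helper, not something committed in this series, and the version table is just a subset of the pins above:

# check_env.py -- hypothetical helper, not part of the repo.
import pkg_resources

# Subset of the pins in requirements.txt; extend as needed.
EXPECTED = {
    "tensorflow": "1.14.0",
    "numpy": "1.20.1",
    "pysc2": "2.0",
    "baselines": "0.1.6",
    "absl-py": "0.11.0",
}

for name, wanted in EXPECTED.items():
    try:
        found = pkg_resources.get_distribution(name).version
    except pkg_resources.DistributionNotFound:
        print("MISSING  %-12s wanted %s" % (name, wanted))
        continue
    status = "OK" if found == wanted else "DIFF"
    print("%-8s %-12s wanted %-10s found %s" % (status, name, wanted, found))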