From 61d6dc57e21c1f42b397eb975dfb2bf12d59f100 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Wed, 24 Feb 2021 21:20:19 -0500 Subject: [PATCH 01/11] Make train_zerglings work with Tensorflow 1.14 --- common/common.py | 8 ++++---- common/vec_env/subproc_vec_env.py | 1 + defeat_zerglings/dqfd.py | 14 +++++++------- requirements.txt | 3 ++- train_defeat_zerglings.py | 7 +++++-- train_mineral_shards.py | 2 +- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/common/common.py b/common/common.py index 82934ee..c5c0669 100644 --- a/common/common.py +++ b/common/common.py @@ -442,19 +442,19 @@ def shift(direction, number, matrix): in the specified (UP, DOWN, LEFT, RIGHT) direction and return it ''' if direction in (UP): - matrix = np.roll(matrix, -number, axis=0) + matrix = np.roll(matrix.__array__(), -number, axis=0) matrix[number:, :] = -2 return matrix elif direction in (DOWN): - matrix = np.roll(matrix, number, axis=0) + matrix = np.roll(matrix.__array__(), number, axis=0) matrix[:number, :] = -2 return matrix elif direction in (LEFT): - matrix = np.roll(matrix, -number, axis=1) + matrix = np.roll(matrix.__array__(), -number, axis=1) matrix[:, number:] = -2 return matrix elif direction in (RIGHT): - matrix = np.roll(matrix, number, axis=1) + matrix = np.roll(matrix.__array__(), number, axis=1) matrix[:, :number] = -2 return matrix else: diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 2952ba0..6cddac0 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -21,6 +21,7 @@ def worker(remote, map_name, nscripts, i): ) with sc2_env.SC2Env( + players=[sc2_env.Agent(sc2_env.Race.terran)], agent_interface_format=[agent_format], map_name=map_name, step_mul=2) as env: diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index aa5707a..ae86e8e 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -6,7 +6,7 @@ import zipfile from absl import flags - +from baselines_legacy import cnn_to_mlp, BatchInput import baselines.common.tf_util as U from baselines import logger @@ -209,7 +209,7 @@ def learn(env, sess.__enter__() def make_obs_ph(name): - return U.BatchInput((64, 64), name=name) + return BatchInput((1, 32, 32), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, @@ -253,7 +253,7 @@ def make_obs_ph(name): obs = env.reset() # Select all marines first - player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = player_relative @@ -296,7 +296,7 @@ def make_obs_ph(name): obs, screen, player = common.select_marine(env, obs) action = act( - np.array(screen)[None], update_eps=update_eps, **kwargs)[0] + np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] reset = False rew = 0 @@ -315,14 +315,14 @@ def make_obs_ph(name): #print(e) 1 # Do nothing - player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] new_screen = player_relative rew += obs[0].reward done = obs[0].step_type == environment.StepType.LAST - selected = obs[0].observation["screen"][_SELECTED] + selected = obs[0].observation["feature_screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): @@ -351,7 +351,7 @@ def make_obs_ph(name): if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() - player_relative = 
obs[0].observation["screen"][ + player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screen = player_relative diff --git a/requirements.txt b/requirements.txt index fcbee81..029d0f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ git+https://github.com/openai/baselines numpy tensorflow absl-py -cloudpickle \ No newline at end of file +cloudpickle +dill \ No newline at end of file diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index 0fba7e3..bc3d8f7 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -7,6 +7,7 @@ from pysc2.env import sc2_env from pysc2.lib import actions from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat +from baselines_legacy import cnn_to_mlp from defeat_zerglings import dqfd @@ -75,9 +76,11 @@ def main(): map_name="DefeatZerglingsAndBanelings", step_mul=step_mul, visualize=True, - game_steps_per_episode=steps * step_mul) as env: + game_steps_per_episode=steps * step_mul, + agent_interface_format=sc2_env.AgentInterfaceFormat( + feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))) as env: - model = deepq.models.cnn_to_mlp( + model = cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True diff --git a/train_mineral_shards.py b/train_mineral_shards.py index 6cd775d..3661718 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -6,7 +6,7 @@ from pysc2.env import sc2_env from pysc2.lib import actions -from baselines_legacy import cnn_to_mlp +from baselines_legacy import cnn_to_mlp, BatchInput from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat from common.vec_env.subproc_vec_env import SubprocVecEnv From 9a704d8ac14de9efa4396c6dcaf39cd116c3846d Mon Sep 17 00:00:00 2001 From: rwill128 Date: Wed, 24 Feb 2021 22:30:32 -0500 Subject: [PATCH 02/11] Reformatting some files, and also fixing action assignments so that they fit the 32*32 action space. --- a2c/a2c.py | 12 +- common/common.py | 1110 ++++++++++++++--------------- common/vec_env/subproc_vec_env.py | 12 +- defeat_zerglings/dqfd.py | 658 +++++++++-------- train_defeat_zerglings.py | 3 +- 5 files changed, 890 insertions(+), 905 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index b11d814..2739292 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -22,7 +22,7 @@ _CONTROL_GROUP_RECALL = 0 _NOT_QUEUED = 0 -# np.set_printoptions(threshold=np.inf) +np.set_printoptions(threshold=np.inf) def mse(pred, target): return tf.square(pred-target)/2. 
@@ -300,12 +300,12 @@ def __init__(self, self.group_id = [0 for _ in range(nenv)] def update_obs(self, obs): # (self.nenv, 32, 32, 2) - #obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) self.obs = np.roll(self.obs, shift=-3, axis=3) new_map = np.zeros((self.nenv, 32, 32, 3)) new_map[:, :, :, -1] = obs[:, 0, :, :] for env_num in range(self.nenv): - # print("xy_per_marine: ", self.xy_per_marine) + print("xy_per_marine: ", self.xy_per_marine) if "0" not in self.xy_per_marine[env_num]: self.xy_per_marine[env_num]["0"] = [0, 0] if "1" not in self.xy_per_marine[env_num]: @@ -319,14 +319,14 @@ def update_obs(self, obs): # (self.nenv, 32, 32, 2) # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) def update_available(self, _available_actions): - #print("update_available : ", _available_actions) + print("update_available : ", _available_actions) self.available_actions = _available_actions # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) for env_num, list in enumerate(_available_actions): - # print("env_num :", env_num, " list :", list) + print("env_num :", env_num, " list :", list) for action_num in list: - # print("action_num :", action_num) + print("action_num :", action_num) if (action_num == 4): self.base_act_mask[env_num][0] = 1 self.base_act_mask[env_num][1] = 1 diff --git a/common/common.py b/common/common.py index c5c0669..73dbd9f 100644 --- a/common/common.py +++ b/common/common.py @@ -32,653 +32,647 @@ def init(env, obs): - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - # print("init") - army_count = env._obs[0].observation.player_common.army_count - - player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - - # if(army_count==0): - # return obs - # try: - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) - # - # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [[_SELECT_ALL]])]) - # except Exception as e: - # print(e) - # for i in range(len(player_x)): - # if i % 4 != 0: - # continue - # - # xy = [player_x[i], player_y[i]] - # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) - - group_id = 0 - group_list = [] - unit_xy_list = [] - last_xy = [0, 0] - xy_per_marine = {} - for i in range(len(player_x)): - - if group_id > 9: - break - - xy = [player_x[i], player_y[i]] - unit_xy_list.append(xy) - - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - obs = 
env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) - ]) - else: - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) - ]) - last_xy = xy - - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, - [[_CONTROL_GROUP_SET], [group_id]]) - ]) - unit_xy_list = [] - xy_per_marine[str(group_id)] = last_xy - - group_list.append(group_id) - group_id += 1 - - if len(unit_xy_list) >= 1: - for idx, xy in enumerate(unit_xy_list): - if idx == 0: - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) - ]) - else: + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + # print("init") + army_count = env._obs[0].observation.player_common.army_count + + player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + + # if(army_count==0): + # return obs + # try: + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) + # + # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [[_SELECT_ALL]])]) + # except Exception as e: + # print(e) + # for i in range(len(player_x)): + # if i % 4 != 0: + # continue + # + # xy = [player_x[i], player_y[i]] + # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) + + group_id = 0 + group_list = [] + unit_xy_list = [] + last_xy = [0, 0] + xy_per_marine = {} + for i in range(len(player_x)): + + if group_id > 9: + break + + xy = [player_x[i], player_y[i]] + unit_xy_list.append(xy) + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) + ]) + else: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + ]) + last_xy = xy + + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, + [[_CONTROL_GROUP_SET], [group_id]]) + ]) + unit_xy_list = [] + xy_per_marine[str(group_id)] = last_xy + + group_list.append(group_id) + group_id += 1 + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) + ]) + else: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + ]) + last_xy = xy + obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, + [[_CONTROL_GROUP_SET], [group_id]]) ]) - last_xy = xy + xy_per_marine[str(group_id)] = last_xy - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, - [[_CONTROL_GROUP_SET], [group_id]]) 
- ]) - xy_per_marine[str(group_id)] = last_xy + group_list.append(group_id) + group_id += 1 - group_list.append(group_id) - group_id += 1 - - return obs, xy_per_marine + return obs, xy_per_marine def solve_tsp( - player_relative, - selected, - group_list, - group_id, - dest_per_marine, - xy_per_marine): - - my_dest = None - other_dest = None - closest, min_dist = None, None - actions = [] - neutral_y, neutral_x = (player_relative == 1).nonzero() - player_y, player_x = (selected == 1).nonzero() - - #for group_id in group_list: - if "0" in dest_per_marine and "1" in dest_per_marine: - if group_id == 0: - my_dest = dest_per_marine["0"] - other_dest = dest_per_marine["1"] - else: - my_dest = dest_per_marine["1"] - other_dest = dest_per_marine["0"] + player_relative, + selected, + group_list, + group_id, + dest_per_marine, + xy_per_marine): + my_dest = None + other_dest = None + closest, min_dist = None, None + actions = [] + neutral_y, neutral_x = (player_relative == 1).nonzero() + player_y, player_x = (selected == 1).nonzero() + + # for group_id in group_list: + if "0" in dest_per_marine and "1" in dest_per_marine: + if group_id == 0: + my_dest = dest_per_marine["0"] + other_dest = dest_per_marine["1"] + else: + my_dest = dest_per_marine["1"] + other_dest = dest_per_marine["0"] - if len(player_x) > 0: - if group_id == 0: - xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())] - else: - xy_per_marine["0"] = [int(player_x.mean()), int(player_y.mean())] - - player = xy_per_marine[str(group_id)] - points = [player] - - for p in zip(neutral_x, neutral_y): - - if other_dest: - dist = np.linalg.norm(np.array(other_dest) - np.array(p)) - if dist < 10: - # print("continue since partner will take care of it ", p) - continue - - pp = [p[0], p[1]] - if pp not in points: - points.append(pp) - - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist or dist < min_dist: - closest, min_dist = p, dist - - solve_tsp = False - if my_dest: - dist = np.linalg.norm(np.array(player) - np.array(my_dest)) - if dist < 0.5: - solve_tsp = True - - if my_dest is None: - solve_tsp = True - - if len(points) < 2: - solve_tsp = False - - if solve_tsp: - # function for printing best found solution when it is found - from time import clock - init = clock() - - def report_sol(obj, s=""): - print("cpu:%g\tobj:%g\ttour:%s" % \ - (clock(), obj, s)) - - n, D = mk_matrix(points, distL2) - niter = 50 - tour, z = multistart_localsearch(niter, n, D) - - left, right = None, None - for idx in tour: - if tour[idx] == 0: - if idx == len(tour) - 1: - right = points[tour[0]] - left = points[tour[idx - 1]] - elif idx == 0: - right = points[tour[idx + 1]] - left = points[tour[len(tour) - 1]] - else: - right = points[tour[idx + 1]] - left = points[tour[idx - 1]] - - left_d = np.linalg.norm(np.array(player) - np.array(left)) - right_d = np.linalg.norm(np.array(player) - np.array(right)) - if right_d > left_d: - closest = left - else: - closest = right - - #print("optimal next :" , closest) - dest_per_marine[str(group_id)] = closest - #print("dest_per_marine", self.dest_per_marine) - #dest_per_marine {'0': [56, 26], '1': [52, 6]} - - if closest: - if group_id == 0: - actions.append({ - "base_action": group_id, - "x0": closest[0], - "y0": closest[1] - }) - else: - actions.append({ - "base_action": group_id, - "x1": closest[0], - "y1": closest[1] - }) + if len(player_x) > 0: + if group_id == 0: + xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())] + else: + xy_per_marine["0"] = [int(player_x.mean()), 
int(player_y.mean())] + + player = xy_per_marine[str(group_id)] + points = [player] + + for p in zip(neutral_x, neutral_y): + + if other_dest: + dist = np.linalg.norm(np.array(other_dest) - np.array(p)) + if dist < 10: + # print("continue since partner will take care of it ", p) + continue + + pp = [p[0], p[1]] + if pp not in points: + points.append(pp) + + dist = np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist or dist < min_dist: + closest, min_dist = p, dist + + solve_tsp = False + if my_dest: + dist = np.linalg.norm(np.array(player) - np.array(my_dest)) + if dist < 0.5: + solve_tsp = True + + if my_dest is None: + solve_tsp = True + + if len(points) < 2: + solve_tsp = False + + if solve_tsp: + # function for printing best found solution when it is found + from time import clock + init = clock() + + def report_sol(obj, s=""): + print("cpu:%g\tobj:%g\ttour:%s" % \ + (clock(), obj, s)) + + n, D = mk_matrix(points, distL2) + niter = 50 + tour, z = multistart_localsearch(niter, n, D) + + left, right = None, None + for idx in tour: + if tour[idx] == 0: + if idx == len(tour) - 1: + right = points[tour[0]] + left = points[tour[idx - 1]] + elif idx == 0: + right = points[tour[idx + 1]] + left = points[tour[len(tour) - 1]] + else: + right = points[tour[idx + 1]] + left = points[tour[idx - 1]] + + left_d = np.linalg.norm(np.array(player) - np.array(left)) + right_d = np.linalg.norm(np.array(player) - np.array(right)) + if right_d > left_d: + closest = left + else: + closest = right + + # print("optimal next :" , closest) + dest_per_marine[str(group_id)] = closest + # print("dest_per_marine", self.dest_per_marine) + # dest_per_marine {'0': [56, 26], '1': [52, 6]} + + if closest: + if group_id == 0: + actions.append({ + "base_action": group_id, + "x0": closest[0], + "y0": closest[1] + }) + else: + actions.append({ + "base_action": group_id, + "x1": closest[0], + "y1": closest[1] + }) + + elif my_dest: + if group_id == 0: + actions.append({ + "base_action": group_id, + "x0": my_dest[0], + "y0": my_dest[1] + }) + else: + actions.append({ + "base_action": group_id, + "x1": my_dest[0], + "y1": my_dest[1] + }) + else: + if group_id == 0: + actions.append({ + "base_action": 2, + "x0": 0, + "y0": 0 + }) + else: + actions.append({ + "base_action": 2, + "x1": 0, + "y1": 0 + }) + + # elif(len(group_list)>0): + # + # group_id = random.randint(0,len(group_list)-1) + # actions.append({"base_action":group_id}) - elif my_dest: - if group_id == 0: - actions.append({ - "base_action": group_id, - "x0": my_dest[0], - "y0": my_dest[1] - }) - else: - actions.append({ - "base_action": group_id, - "x1": my_dest[0], - "y1": my_dest[1] - }) + if group_id == 0: + group_id = 1 else: - if group_id == 0: - actions.append({ - "base_action": 2, - "x0": 0, - "y0": 0 - }) - else: - actions.append({ - "base_action": 2, - "x1": 0, - "y1": 0 - }) - - # elif(len(group_list)>0): - # - # group_id = random.randint(0,len(group_list)-1) - # actions.append({"base_action":group_id}) + group_id = 0 - if group_id == 0: - group_id = 1 - else: - group_id = 0 + if "0" not in xy_per_marine: + xy_per_marine["0"] = [0, 0] + if "1" not in xy_per_marine: + xy_per_marine["1"] = [0, 0] - if "0" not in xy_per_marine: - xy_per_marine["0"] = [0, 0] - if "1" not in xy_per_marine: - xy_per_marine["1"] = [0, 0] - - return actions, group_id, dest_per_marine, xy_per_marine + return actions, group_id, dest_per_marine, xy_per_marine def group_init_queue(player_relative): + actions = [] + + player_x, player_y = (player_relative == 
_PLAYER_FRIENDLY).nonzero() + # try: + # + # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + # actions.append({"base_action":_SELECT_ARMY, "sub7":_SELECT_ALL}) + # + # except Exception as e: + # print(e) + # for i in range(len(player_x)): + # if i % 4 != 0: + # continue + # + # xy = [player_x[i], player_y[i]] + # actions.append({"base_action":_SELECT_POINT, "sub6":0, "x0":xy[0], "y0":xy[1]}) + + group_id = 0 + group_list = [] + unit_xy_list = [] + for i in range(len(player_x)): + + if group_id > 9: + break + + xy = [player_x[i], player_y[i]] + unit_xy_list.append(xy) + # 2/select_point (6/select_point_act [4]; 0/screen [84, 84]) + # 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10]) + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 0, + "x0": xy[0], + "y0": xy[1] + }) + else: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 1, + "x0": xy[0], + "y0": xy[1] + }) + + actions.append({ + "base_action": _SELECT_CONTROL_GROUP, + "sub4": _CONTROL_GROUP_SET, + "sub5": group_id + }) + unit_xy_list = [] + + group_list.append(group_id) + group_id += 1 + + if len(unit_xy_list) >= 1: + for idx, xy in enumerate(unit_xy_list): + if idx == 0: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 0, + "x0": xy[0], + "y0": xy[1] + }) + else: + actions.append({ + "base_action": _SELECT_POINT, + "sub6": 1, + "x0": xy[0], + "y0": xy[1] + }) - actions = [] - - player_x, player_y = (player_relative == _PLAYER_FRIENDLY).nonzero() - # try: - # - # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - # actions.append({"base_action":_SELECT_ARMY, "sub7":_SELECT_ALL}) - # - # except Exception as e: - # print(e) - # for i in range(len(player_x)): - # if i % 4 != 0: - # continue - # - # xy = [player_x[i], player_y[i]] - # actions.append({"base_action":_SELECT_POINT, "sub6":0, "x0":xy[0], "y0":xy[1]}) - - group_id = 0 - group_list = [] - unit_xy_list = [] - for i in range(len(player_x)): - - if group_id > 9: - break - - xy = [player_x[i], player_y[i]] - unit_xy_list.append(xy) - # 2/select_point (6/select_point_act [4]; 0/screen [84, 84]) - # 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10]) - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 0, - "x0": xy[0], - "y0": xy[1] - }) - else: - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 1, - "x0": xy[0], - "y0": xy[1] - }) - - actions.append({ - "base_action": _SELECT_CONTROL_GROUP, - "sub4": _CONTROL_GROUP_SET, - "sub5": group_id - }) - unit_xy_list = [] - - group_list.append(group_id) - group_id += 1 - - if (len(unit_xy_list) >= 1): - for idx, xy in enumerate(unit_xy_list): - if (idx == 0): - actions.append({ - "base_action": _SELECT_POINT, - "sub6": 0, - "x0": xy[0], - "y0": xy[1] - }) - else: actions.append({ - "base_action": _SELECT_POINT, - "sub6": 1, - "x0": xy[0], - "y0": xy[1] + "base_action": _SELECT_CONTROL_GROUP, + "sub4": _CONTROL_GROUP_SET, + "sub5": group_id }) - actions.append({ - "base_action": _SELECT_CONTROL_GROUP, - "sub4": _CONTROL_GROUP_SET, - "sub5": group_id - }) - - group_list.append(group_id) - group_id += 1 + group_list.append(group_id) + group_id += 1 - return actions + return actions def update_group_list2(control_group): + group_count = 0 + group_list = [] - group_count = 0 - group_list = [] + for control_group_id, data in 
enumerate(control_group): - for control_group_id, data in enumerate(control_group): + unit_id = data[0] + count = data[1] - unit_id = data[0] - count = data[1] + if unit_id != 0: + group_count += 1 + group_list.append(control_group_id) - if (unit_id != 0): - group_count += 1 - group_list.append(control_group_id) - - return group_list + return group_list def check_group_list2(extra): - army_count = 0 - # (64, 64, 3) - for control_group_id in range(10): - unit_id = extra[control_group_id, 1] - count = extra[control_group_id, 2] - if (unit_id != 0): - army_count += count + army_count = 0 + # (64, 64, 3) + for control_group_id in range(10): + unit_id = extra[control_group_id, 1] + count = extra[control_group_id, 2] + if unit_id != 0: + army_count += count - if (army_count != extra[0, 0]): - return True + if army_count != extra[0, 0]: + return True - return False + return False def update_group_list(obs): - control_groups = obs[0].observation["control_groups"] - group_count = 0 - group_list = [] - for id, group in enumerate(control_groups): - if (group[0] != 0): - group_count += 1 - group_list.append(id) - return group_list + control_groups = obs[0].observation["control_groups"] + group_count = 0 + group_list = [] + for id, group in enumerate(control_groups): + if group[0] != 0: + group_count += 1 + group_list.append(id) + return group_list def check_group_list(env, obs): - error = False - control_groups = obs[0].observation["control_groups"] - army_count = 0 - for id, group in enumerate(control_groups): - if (group[0] == 48): - army_count += group[1] - if (group[1] != 1): - #print("group error group_id : %s count : %s" % (id, group[1])) + error = False + control_groups = obs[0].observation["control_groups"] + army_count = 0 + for id, group in enumerate(control_groups): + if group[0] == 48: + army_count += group[1] + if group[1] != 1: + print("group error group_id : %s count : %s" % (id, group[1])) + error = True + return error + if army_count != env._obs[0].observation.player_common.army_count: error = True - return error - if (army_count != env._obs[0].observation.player_common.army_count): - error = True - # print("army_count %s != %s env._obs.observation.player_common.army_count " - # % (army_count, env._obs.observation.player_common.army_count)) + print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) - return error + return error UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right' def shift(direction, number, matrix): - ''' shift given 2D matrix in-place the given number of rows or columns - in the specified (UP, DOWN, LEFT, RIGHT) direction and return it -''' - if direction in (UP): - matrix = np.roll(matrix.__array__(), -number, axis=0) - matrix[number:, :] = -2 - return matrix - elif direction in (DOWN): - matrix = np.roll(matrix.__array__(), number, axis=0) - matrix[:number, :] = -2 - return matrix - elif direction in (LEFT): - matrix = np.roll(matrix.__array__(), -number, axis=1) - matrix[:, number:] = -2 - return matrix - elif direction in (RIGHT): - matrix = np.roll(matrix.__array__(), number, axis=1) - matrix[:, :number] = -2 - return matrix - else: - return matrix + ''' shift given 2D matrix in-place the given number of rows or columns + in the specified (UP, DOWN, LEFT, RIGHT) direction and return it + ''' + if direction in UP: + matrix = np.roll(matrix.__array__(), -number, axis=0) + matrix[number:, :] = -2 + return matrix + elif direction in DOWN: + matrix = np.roll(matrix.__array__(), 
number, axis=0) + matrix[:number, :] = -2 + return matrix + elif direction in LEFT: + matrix = np.roll(matrix.__array__(), -number, axis=1) + matrix[:, number:] = -2 + return matrix + elif direction in RIGHT: + matrix = np.roll(matrix.__array__(), number, axis=1) + matrix[:, :number] = -2 + return matrix + else: + return matrix def select_marine(env, obs): + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + screen = player_relative - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - screen = player_relative + group_list = update_group_list(obs) - group_list = update_group_list(obs) + if check_group_list(env, obs): + obs, xy_per_marine = init(env, obs) + group_list = update_group_list(obs) - if (check_group_list(env, obs)): - obs, xy_per_marine = init(env, obs) - group_list = update_group_list(obs) + # if(len(group_list) == 0): + # obs = init(env, player_relative, obs) + # group_list = update_group_list(obs) - # if(len(group_list) == 0): - # obs = init(env, player_relative, obs) - # group_list = update_group_list(obs) - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - - friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - - enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() - - player = [] - - danger_closest, danger_min_dist = None, None - for e in zip(enemy_x, enemy_y): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(p) - np.array(e)) - if not danger_min_dist or dist < danger_min_dist: - danger_closest, danger_min_dist = p, dist - - marine_closest, marine_min_dist = None, None - for e in zip(friendly_x, friendly_y): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(p) - np.array(e)) - if not marine_min_dist or dist < marine_min_dist: - if dist >= 2: - marine_closest, marine_min_dist = p, dist - - if (danger_min_dist != None and danger_min_dist <= 5): - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], danger_closest]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - elif (marine_closest != None and marine_min_dist <= 3): - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_POINT, [[0], marine_closest]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - else: - - # If there is no marine in danger, select random - while (len(group_list) > 0): - # units = env._obs.observation.raw_data.units - # marine_list = [] # for unit in units: - # if(unit.alliance == 1): - # marine_list.append(unit) - - group_id = np.random.choice(group_list) - #xy = [int(unit.pos.y - 10), int(unit.pos.x+8)] - #print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]])) - obs = env.step(actions=[ - sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[ - _CONTROL_GROUP_RECALL - ], [int(group_id)]]) - ]) - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - break - else: - group_list.remove(group_id) - - if (len(player) == 2): - - if (player[0] > 32): - screen = shift(LEFT, player[0] - 32, screen) - elif (player[0] < 32): - screen = shift(RIGHT, 
32 - player[0], screen) - - if (player[1] > 32): - screen = shift(UP, player[1] - 32, screen) - elif (player[1] < 32): - screen = shift(DOWN, 32 - player[1], screen) - - return obs, screen, player + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() -def marine_action(env, obs, player, action): + enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() + + player = [] + + danger_closest, danger_min_dist = None, None + for e in zip(enemy_x, enemy_y): + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(p) - np.array(e)) + if not danger_min_dist or dist < danger_min_dist: + danger_closest, danger_min_dist = p, dist - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + marine_closest, marine_min_dist = None, None + for e in zip(friendly_x, friendly_y): + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(p) - np.array(e)) + if not marine_min_dist or dist < marine_min_dist: + if dist >= 2: + marine_closest, marine_min_dist = p, dist + + if danger_min_dist is not None and danger_min_dist <= 5: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], danger_closest]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + + elif marine_closest is not None and marine_min_dist <= 3: + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_POINT, [[0], marine_closest]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + + else: + + # If there is no marine in danger, select random + while len(group_list) > 0: + # units = env._obs.observation.raw_data.units + # marine_list = [] # for unit in units: + # if(unit.alliance == 1): + # marine_list.append(unit) + + group_id = np.random.choice(group_list) + # xy = [int(unit.pos.y - 10), int(unit.pos.x+8)] + # print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]])) + obs = env.step(actions=[ + sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[ + _CONTROL_GROUP_RECALL + ], [int(group_id)]]) + ]) + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + if len(player_y) > 0: + player = [int(player_x.mean()), int(player_y.mean())] + break + else: + group_list.remove(group_id) + + if len(player) == 2: + + if player[0] > 32: + screen = shift(LEFT, player[0] - 32, screen) + elif player[0] < 32: + screen = shift(RIGHT, 32 - player[0], screen) + + if player[1] > 32: + screen = shift(UP, player[1] - 32, screen) + elif player[1] < 32: + screen = shift(DOWN, 32 - player[1], screen) + + return obs, screen, player + + +def marine_action(env, obs, player, action): + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() + enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero() - closest, min_dist = None, None + closest, min_dist = None, None - if (len(player) == 2): - for p in zip(enemy_x, enemy_y): - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist or dist < min_dist: - closest, min_dist = p, dist + if len(player) == 2: + for p in zip(enemy_x, enemy_y): + dist = 
np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist or dist < min_dist: + closest, min_dist = p, dist - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero() - closest_friend, min_dist_friend = None, None - if (len(player) == 2): - for p in zip(friendly_x, friendly_y): - dist = np.linalg.norm(np.array(player) - np.array(p)) - if not min_dist_friend or dist < min_dist_friend: - closest_friend, min_dist_friend = p, dist + closest_friend, min_dist_friend = None, None + if len(player) == 2: + for p in zip(friendly_x, friendly_y): + dist = np.linalg.norm(np.array(player) - np.array(p)) + if not min_dist_friend or dist < min_dist_friend: + closest_friend, min_dist_friend = p, dist - if (closest == None): + if closest is None: - new_action = [sc2_actions.FunctionCall(_NO_OP, [])] + new_action = [sc2_actions.FunctionCall(_NO_OP, [])] - elif (action == 0 and closest_friend != None and min_dist_friend < 3): - # Friendly marine is too close => Sparse! + elif action == 0 and closest_friend is not None and min_dist_friend < 3: + # Friendly marine is too close => Sparse! - mean_friend = [int(friendly_x.mean()), int(friendly_x.mean())] + mean_friend = [int(friendly_x.mean()), int(friendly_x.mean())] - diff = np.array(player) - np.array(closest_friend) + diff = np.array(player) - np.array(closest_friend) - norm = np.linalg.norm(diff) + norm = np.linalg.norm(diff) - if (norm != 0): - diff = diff / norm + if norm != 0: + diff = diff / norm - coord = np.array(player) + diff * 4 + coord = np.array(player) + diff * 4 - if (coord[0] < 0): - coord[0] = 0 - elif (coord[0] > 63): - coord[0] = 63 + if coord[0] < 0: + coord[0] = 0 + elif coord[0] > 31: + coord[0] = 31 - if (coord[1] < 0): - coord[1] = 0 - elif (coord[1] > 63): - coord[1] = 63 + if coord[1] < 0: + coord[1] = 0 + elif coord[1] > 31: + coord[1] = 31 - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action <= 1): #Attack + elif action <= 1: # Attack - # nearest enemy + # nearest enemy - coord = closest + coord = closest - new_action = [ - sc2_actions.FunctionCall(_ATTACK_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + sc2_actions.FunctionCall(_ATTACK_SCREEN, [[_NOT_QUEUED], coord]) + ] - #print("action : %s Attack Coord : %s" % (action, coord)) + # print("action : %s Attack Coord : %s" % (action, coord)) - elif (action == 2): # Oppsite direcion from enemy + elif action == 2: # Oppsite direcion from enemy - # nearest enemy opposite + # nearest enemy opposite - diff = np.array(player) - np.array(closest) + diff = np.array(player) - np.array(closest) - norm = np.linalg.norm(diff) + norm = np.linalg.norm(diff) - if (norm != 0): - diff = diff / norm + if norm != 0: + diff = diff / norm - coord = np.array(player) + diff * 7 + coord = np.array(player) + diff * 7 - if (coord[0] < 0): - coord[0] = 0 - elif (coord[0] > 63): - coord[0] = 63 + if coord[0] < 0: + coord[0] = 0 + elif coord[0] > 31: + coord[0] = 31 - if (coord[1] < 0): - coord[1] = 0 - elif (coord[1] > 63): - coord[1] = 63 + if coord[1] < 0: + coord[1] = 0 + elif coord[1] > 31: + coord[1] = 31 - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + new_action = [ + 
sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 4): #UP - coord = [player[0], player[1] - 3] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 4: # UP + coord = [player[0], player[1] - 3] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 5): #DOWN - coord = [player[0], player[1] + 3] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 5: # DOWN + coord = [player[0], player[1] + 3] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 6): #LEFT - coord = [player[0] - 3, player[1]] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 6: # LEFT + coord = [player[0] - 3, player[1]] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - elif (action == 7): #RIGHT - coord = [player[0] + 3, player[1]] - new_action = [ - sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) - ] + elif action == 7: # RIGHT + coord = [player[0] + 3, player[1]] + new_action = [ + sc2_actions.FunctionCall(_MOVE_SCREEN, [[_NOT_QUEUED], coord]) + ] - #print("action : %s Back Coord : %s" % (action, coord)) + print("action : %s Back Coord : %s" % (action, coord)) - return obs, new_action + return obs, new_action diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 6cddac0..16fcb99 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -41,10 +41,10 @@ def worker(remote, map_name, nscripts, i): action1 = data[0][0] action2 = data[0][1] - # func = actions.FUNCTIONS[action1[0]] - # print("agent(",i," ) action : ", action1, " func : ", func) + func = actions.FUNCTIONS[action1[0]] + print("agent(",i," ) action : ", action1, " func : ", func) func = actions.FUNCTIONS[action2[0]] - # print("agent(",i," ) action : ", action2, " func : ", func) + print("agent(",i," ) action : ", action2, " func : ", func) result = env.step(actions=[action1]) @@ -55,10 +55,10 @@ def worker(remote, map_name, nscripts, i): if len(action2[1]) == 2: x, y = action2[1][1] - # print("x, y:", x, y) + print("x, y:", x, y) - # if x == 0 and y == 0: - # move = False + if x == 0 and y == 0: + move = False if (331 in available_actions and move and not done): try: diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index ae86e8e..0f1aee7 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -50,65 +50,65 @@ class ActWrapper(object): - def __init__(self, act): - self._act = act - #self._act_params = act_params - - @staticmethod - def load(path, act_params, num_cpu=16): - with open(path, "rb") as f: - model_data = dill.load(f) - act = deepq.build_act(**act_params) - sess = U.make_session(num_cpu=num_cpu) - sess.__enter__() - with tempfile.TemporaryDirectory() as td: - arc_path = os.path.join(td, "packed.zip") - with open(arc_path, "wb") as f: - f.write(model_data) - - zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) - U.load_state(os.path.join(td, "model")) - - return ActWrapper(act) - - def __call__(self, *args, **kwargs): - return self._act(*args, **kwargs) - - def save(self, path): - """Save model to a pickle located at `path`""" - with tempfile.TemporaryDirectory() as td: - U.save_state(os.path.join(td, "model")) - arc_name = os.path.join(td, "packed.zip") - with zipfile.ZipFile(arc_name, 'w') as zipf: - for 
root, dirs, files in os.walk(td): - for fname in files: - file_path = os.path.join(root, fname) - if file_path != arc_name: - zipf.write(file_path, - os.path.relpath(file_path, td)) - with open(arc_name, "rb") as f: - model_data = f.read() - with open(path, "wb") as f: - dill.dump((model_data), f) + def __init__(self, act): + self._act = act + # self._act_params = act_params + + @staticmethod + def load(path, act_params, num_cpu=16): + with open(path, "rb") as f: + model_data = dill.load(f) + act = deepq.build_act(**act_params) + sess = U.make_session(num_cpu=num_cpu) + sess.__enter__() + with tempfile.TemporaryDirectory() as td: + arc_path = os.path.join(td, "packed.zip") + with open(arc_path, "wb") as f: + f.write(model_data) + + zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) + U.load_state(os.path.join(td, "model")) + + return ActWrapper(act) + + def __call__(self, *args, **kwargs): + return self._act(*args, **kwargs) + + def save(self, path): + """Save model to a pickle located at `path`""" + with tempfile.TemporaryDirectory() as td: + U.save_state(os.path.join(td, "model")) + arc_name = os.path.join(td, "packed.zip") + with zipfile.ZipFile(arc_name, 'w') as zipf: + for root, dirs, files in os.walk(td): + for fname in files: + file_path = os.path.join(root, fname) + if file_path != arc_name: + zipf.write(file_path, + os.path.relpath(file_path, td)) + with open(arc_name, "rb") as f: + model_data = f.read() + with open(path, "wb") as f: + dill.dump((model_data), f) def load(path, act_params, num_cpu=16): - """Load act function that was returned by learn function. + """Load act function that was returned by learn function. -Parameters ----------- -path: str - path to the act function pickle -num_cpu: int - number of cpus to use for executing the policy + Parameters + ---------- + path: str + path to the act function pickle + num_cpu: int + number of cpus to use for executing the policy -Returns -------- -act: ActWrapper - function that takes a batch of observations - and returns actions. -""" - return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params) + Returns + ------- + act: ActWrapper + function that takes a batch of observations + and returns actions. + """ + return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params) def learn(env, @@ -136,283 +136,273 @@ def learn(env, param_noise_threshold=0.05, callback=None, demo_replay=[]): - """Train a deepq model. - -Parameters -------- -env: pysc2.env.SC2Env - environment to train on -q_func: (tf.Variable, int, str, bool) -> tf.Variable - the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. -lr: float - learning rate for adam optimizer -max_timesteps: int - number of env steps to optimizer for -buffer_size: int - size of the replay buffer -exploration_fraction: float - fraction of entire training period over which the exploration rate is annealed -exploration_final_eps: float - final value of random action probability -train_freq: int - update the model every `train_freq` steps. - set to None to disable printing -batch_size: int - size of a batched sampled from replay buffer for training -print_freq: int - how often to print out training progress - set to None to disable printing -checkpoint_freq: int - how often to save the model. 
This is so that the best version is restored - at the end of the training. If you do not wish to restore the best version at - the end of the training set this variable to None. -learning_starts: int - how many steps of the model to collect transitions for before learning starts -gamma: float - discount factor -target_network_update_freq: int - update the target network every `target_network_update_freq` steps. -prioritized_replay: True - if True prioritized replay buffer will be used. -prioritized_replay_alpha: float - alpha parameter for prioritized replay buffer -prioritized_replay_beta0: float - initial value of beta for prioritized replay buffer -prioritized_replay_beta_iters: int - number of iterations over which beta will be annealed from initial value - to 1.0. If set to None equals to max_timesteps. -prioritized_replay_eps: float - epsilon to add to the TD errors when updating priorities. -num_cpu: int - number of cpus to use for training -callback: (locals, globals) -> None - function called at every steps with state of the algorithm. - If callback returns true training stops. - -Returns -------- -act: ActWrapper - Wrapper over act function. Adds ability to save it and load it. - See header of baselines/deepq/categorical.py for details on the act function. -""" - # Create all the functions necessary to train the model - - sess = U.make_session(num_cpu=num_cpu) - sess.__enter__() - - def make_obs_ph(name): - return BatchInput((1, 32, 32), name=name) - - act, train, update_target, debug = deepq.build_train( - make_obs_ph=make_obs_ph, - q_func=q_func, - num_actions=num_actions, - optimizer=tf.train.AdamOptimizer(learning_rate=lr), - gamma=gamma, - grad_norm_clipping=10) - act_params = { - 'make_obs_ph': make_obs_ph, - 'q_func': q_func, - 'num_actions': num_actions, - } - - # Create the replay buffer - if prioritized_replay: - replay_buffer = PrioritizedReplayBuffer( - buffer_size, alpha=prioritized_replay_alpha) - if prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = max_timesteps - beta_schedule = LinearSchedule( - prioritized_replay_beta_iters, - initial_p=prioritized_replay_beta0, - final_p=1.0) - else: - replay_buffer = ReplayBuffer(buffer_size) - beta_schedule = None - # Create the schedule for exploration starting from 1. - exploration = LinearSchedule( - schedule_timesteps=int(exploration_fraction * max_timesteps), - initial_p=1.0, - final_p=exploration_final_eps) - - # Initialize the parameters and copy them to the target network. - U.initialize() - update_target() - - episode_rewards = [0.0] - saved_mean_reward = None - - obs = env.reset() - # Select all marines first - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - - screen = player_relative - - obs, xy_per_marine = common.init(env, obs) - - group_id = 0 - reset = True - with tempfile.TemporaryDirectory() as td: - model_saved = False - model_file = os.path.join(td, "model") - - for t in range(max_timesteps): - if callback is not None: - if callback(locals(), globals()): - break - # Take action and update exploration to the newest value - kwargs = {} - if not param_noise: - update_eps = exploration.value(t) - update_param_noise_threshold = 0. - else: - update_eps = 0. - if param_noise_threshold >= 0.: - update_param_noise_threshold = param_noise_threshold - else: - # Compute the threshold such that the KL divergence between perturbed and non-perturbed - # policy is comparable to eps-greedy exploration with eps = exploration.value(t). 
- # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 - # for detailed explanation. - update_param_noise_threshold = -np.log( - 1. - exploration.value(t) + - exploration.value(t) / float(num_actions)) - kwargs['reset'] = reset - kwargs[ - 'update_param_noise_threshold'] = update_param_noise_threshold - kwargs['update_param_noise_scale'] = True - - # custom process for DefeatZerglingsAndBanelings - - obs, screen, player = common.select_marine(env, obs) - - action = act( - np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] - reset = False - rew = 0 - - new_action = None - - obs, new_action = common.marine_action(env, obs, player, action) - army_count = env._obs[0].observation.player_common.army_count - - try: - if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]: - obs = env.step(actions=new_action) - else: - new_action = [sc2_actions.FunctionCall(_NO_OP, [])] - obs = env.step(actions=new_action) - except Exception as e: - #print(e) - 1 # Do nothing - - player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] - new_screen = player_relative - - rew += obs[0].reward - - done = obs[0].step_type == environment.StepType.LAST - - selected = obs[0].observation["feature_screen"][_SELECTED] - player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() - - if (len(player_y) > 0): - player = [int(player_x.mean()), int(player_y.mean())] - - if (len(player) == 2): - - if (player[0] > 32): - new_screen = common.shift(LEFT, player[0] - 32, new_screen) - elif (player[0] < 32): - new_screen = common.shift(RIGHT, 32 - player[0], - new_screen) - - if (player[1] > 32): - new_screen = common.shift(UP, player[1] - 32, new_screen) - elif (player[1] < 32): - new_screen = common.shift(DOWN, 32 - player[1], new_screen) - - # Store transition in the replay buffer. - replay_buffer.add(screen, action, rew, new_screen, float(done)) - screen = new_screen - - episode_rewards[-1] += rew - reward = episode_rewards[-1] - - if done: - print("Episode Reward : %s" % episode_rewards[-1]) - obs = env.reset() - player_relative = obs[0].observation["feature_screen"][ - _PLAYER_RELATIVE] - - screen = player_relative - - group_list = common.init(env, obs) - - # Select all marines first - #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) - episode_rewards.append(0.0) - - reset = True - - if t > learning_starts and t % train_freq == 0: - # Minimize the error in Bellman's equation on a batch sampled from replay buffer. - if prioritized_replay: - experience = replay_buffer.sample( - batch_size, beta=beta_schedule.value(t)) - (obses_t, actions, rewards, obses_tp1, dones, weights, - batch_idxes) = experience - else: - obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( - batch_size) - weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, - weights) - if prioritized_replay: - new_priorities = np.abs(td_errors) + prioritized_replay_eps - replay_buffer.update_priorities(batch_idxes, - new_priorities) - - if t > learning_starts and t % target_network_update_freq == 0: - # Update target network periodically. 
- update_target() - - mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) - num_episodes = len(episode_rewards) - if done and print_freq is not None and len( - episode_rewards) % print_freq == 0: - logger.record_tabular("steps", t) - logger.record_tabular("episodes", num_episodes) - logger.record_tabular("reward", reward) - logger.record_tabular("mean 100 episode reward", - mean_100ep_reward) - logger.record_tabular("% time spent exploring", - int(100 * exploration.value(t))) - logger.dump_tabular() - - if (checkpoint_freq is not None and t > learning_starts - and num_episodes > 100 and t % checkpoint_freq == 0): - if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: - if print_freq is not None: - logger.log( - "Saving model due to mean reward increase: {} -> {}". - format(saved_mean_reward, mean_100ep_reward)) - U.save_state(model_file) - model_saved = True - saved_mean_reward = mean_100ep_reward - if model_saved: - if print_freq is not None: - logger.log("Restored model with mean reward: {}".format( - saved_mean_reward)) - U.load_state(model_file) - - return ActWrapper(act) + """Train a deepq model. + + Parameters + ------- + env: pysc2.env.SC2Env + environment to train on + q_func: (tf.Variable, int, str, bool) -> tf.Variable + the model that takes the following inputs: + observation_in: object + the output of observation placeholder + num_actions: int + number of actions + scope: str + reuse: bool + should be passed to outer variable scope + and returns a tensor of shape (batch_size, num_actions) with values of every action. + lr: float + learning rate for adam optimizer + max_timesteps: int + number of env steps to optimizer for + buffer_size: int + size of the replay buffer + exploration_fraction: float + fraction of entire training period over which the exploration rate is annealed + exploration_final_eps: float + final value of random action probability + train_freq: int + update the model every `train_freq` steps. + set to None to disable printing + batch_size: int + size of a batched sampled from replay buffer for training + print_freq: int + how often to print out training progress + set to None to disable printing + checkpoint_freq: int + how often to save the model. This is so that the best version is restored + at the end of the training. If you do not wish to restore the best version at + the end of the training set this variable to None. + learning_starts: int + how many steps of the model to collect transitions for before learning starts + gamma: float + discount factor + target_network_update_freq: int + update the target network every `target_network_update_freq` steps. + prioritized_replay: True + if True prioritized replay buffer will be used. + prioritized_replay_alpha: float + alpha parameter for prioritized replay buffer + prioritized_replay_beta0: float + initial value of beta for prioritized replay buffer + prioritized_replay_beta_iters: int + number of iterations over which beta will be annealed from initial value + to 1.0. If set to None equals to max_timesteps. + prioritized_replay_eps: float + epsilon to add to the TD errors when updating priorities. + num_cpu: int + number of cpus to use for training + callback: (locals, globals) -> None + function called at every steps with state of the algorithm. + If callback returns true training stops. + + Returns + ------- + act: ActWrapper + Wrapper over act function. Adds ability to save it and load it. + See header of baselines/deepq/categorical.py for details on the act function. 
+ """ + # Create all the functions necessary to train the model + + sess = U.make_session(num_cpu=num_cpu) + sess.__enter__() + + def make_obs_ph(name): + return BatchInput((1, 32, 32), name=name) + + act, train, update_target, debug = deepq.build_train( + make_obs_ph=make_obs_ph, + q_func=q_func, + num_actions=num_actions, + optimizer=tf.train.AdamOptimizer(learning_rate=lr), + gamma=gamma, + grad_norm_clipping=10) + act_params = { + 'make_obs_ph': make_obs_ph, + 'q_func': q_func, + 'num_actions': num_actions, + } + + # Create the replay buffer + if prioritized_replay: + replay_buffer = PrioritizedReplayBuffer( + buffer_size, alpha=prioritized_replay_alpha) + if prioritized_replay_beta_iters is None: + prioritized_replay_beta_iters = max_timesteps + beta_schedule = LinearSchedule( + prioritized_replay_beta_iters, + initial_p=prioritized_replay_beta0, + final_p=1.0) + else: + replay_buffer = ReplayBuffer(buffer_size) + beta_schedule = None + # Create the schedule for exploration starting from 1. + exploration = LinearSchedule( + schedule_timesteps=int(exploration_fraction * max_timesteps), + initial_p=1.0, + final_p=exploration_final_eps) + + # Initialize the parameters and copy them to the target network. + U.initialize() + update_target() + + episode_rewards = [0.0] + saved_mean_reward = None + + obs = env.reset() + # Select all marines first + + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + + screen = player_relative + + obs, xy_per_marine = common.init(env, obs) + + group_id = 0 + reset = True + with tempfile.TemporaryDirectory() as td: + model_saved = False + model_file = os.path.join(td, "model") + + for t in range(max_timesteps): + if callback is not None: + if callback(locals(), globals()): + break + # Take action and update exploration to the newest value + kwargs = {} + if not param_noise: + update_eps = exploration.value(t) + update_param_noise_threshold = 0. + else: + update_eps = 0. + if param_noise_threshold >= 0.: + update_param_noise_threshold = param_noise_threshold + else: + # Compute the threshold such that the KL divergence between perturbed and non-perturbed + # policy is comparable to eps-greedy exploration with eps = exploration.value(t). + # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 + # for detailed explanation. + update_param_noise_threshold = -np.log( + 1. 
- exploration.value(t) + + exploration.value(t) / float(num_actions)) + kwargs['reset'] = reset + kwargs[ + 'update_param_noise_threshold'] = update_param_noise_threshold + kwargs['update_param_noise_scale'] = True + + # custom process for DefeatZerglingsAndBanelings + + obs, screen, player = common.select_marine(env, obs) + + action = act( + np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] + reset = False + rew = 0 + + new_action = None + + obs, new_action = common.marine_action(env, obs, player, action) + army_count = env._obs[0].observation.player_common.army_count + + try: + if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]: + obs = env.step(actions=new_action) + else: + new_action = [sc2_actions.FunctionCall(_NO_OP, [])] + obs = env.step(actions=new_action) + except Exception as e: + print(e) + rew += -10 + # 1 # Do nothing + + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] + new_screen = player_relative + + rew += obs[0].reward + + done = obs[0].step_type == environment.StepType.LAST + + selected = obs[0].observation["feature_screen"][_SELECTED] + player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() + + if (len(player_y) > 0): + player = [int(player_x.mean()), int(player_y.mean())] + + if (len(player) == 2): + + if (player[0] > 32): + new_screen = common.shift(LEFT, player[0] - 32, new_screen) + elif (player[0] < 32): + new_screen = common.shift(RIGHT, 32 - player[0], + new_screen) + + if (player[1] > 32): + new_screen = common.shift(UP, player[1] - 32, new_screen) + elif (player[1] < 32): + new_screen = common.shift(DOWN, 32 - player[1], new_screen) + + # Store transition in the replay buffer. + replay_buffer.add(screen, action, rew, new_screen, float(done)) + screen = new_screen + + episode_rewards[-1] += rew + reward = episode_rewards[-1] + + if done: + print("Episode Reward : %s" % episode_rewards[-1]) + obs = env.reset() + player_relative = obs[0].observation["feature_screen"][ + _PLAYER_RELATIVE] + + screen = player_relative + + group_list = common.init(env, obs) + + # Select all marines first + # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) + episode_rewards.append(0.0) + + reset = True + + if t > learning_starts and t % train_freq == 0: + # Minimize the error in Bellman's equation on a batch sampled from replay buffer. + if prioritized_replay: + experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) + (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience + else: + obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) + weights, batch_idxes = np.ones_like(rewards), None + td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + if prioritized_replay: + new_priorities = np.abs(td_errors) + prioritized_replay_eps + replay_buffer.update_priorities(batch_idxes, new_priorities) + + if t > learning_starts and t % target_network_update_freq == 0: + # Update target network periodically. 
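# Copying the online Q-network weights into the target network only every
# target_network_update_freq steps keeps the bootstrap targets of the
# Bellman update fixed between copies, which helps keep this DQN-style
# training stable.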
+ update_target() + + mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) + num_episodes = len(episode_rewards) + if done and print_freq is not None and len(episode_rewards) % print_freq == 0: + logger.record_tabular("steps", t) + logger.record_tabular("episodes", num_episodes) + logger.record_tabular("reward", reward) + logger.record_tabular("mean 100 episode reward", mean_100ep_reward) + logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) + logger.dump_tabular() + + if checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0: + if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: + if print_freq is not None: + logger.log("Saving model due to mean reward increase: {} -> {}".format(saved_mean_reward, + mean_100ep_reward)) + U.save_state(model_file) + model_saved = True + saved_mean_reward = mean_100ep_reward + if model_saved: + if print_freq is not None: + logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) + U.load_state(model_file) + + return ActWrapper(act) diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index bc3d8f7..5eb3d79 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -1,3 +1,4 @@ +import pprint import sys import os import datetime @@ -140,7 +141,7 @@ def deepq_callback(locals, globals): def acktr_callback(locals, globals): global max_mean_reward, last_filename - #pprint.pprint(locals) + pprint.pprint(locals) if('mean_100ep_reward' in locals and locals['num_episodes'] >= 10 From 264b7435db92d7f53059bd4f446b808de1474280 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 14:03:44 -0500 Subject: [PATCH 03/11] Reformatting whitespace in a2c.py --- a2c/a2c.py | 1378 ++++++++++++++--------------- common/common.py | 4 +- common/vec_env/subproc_vec_env.py | 6 +- deepq_mineral_shards.py | 3 +- defeat_zerglings/dqfd.py | 13 +- train_defeat_zerglings.py | 8 +- train_mineral_shards.py | 2 +- 7 files changed, 705 insertions(+), 709 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index 2739292..5e2dbfa 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -24,606 +24,604 @@ np.set_printoptions(threshold=np.inf) + def mse(pred, target): - return tf.square(pred-target)/2. + return tf.square(pred - target) / 2. 
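The factor of 1/2 in mse above is the usual convention that makes the gradient with respect to pred come out as the plain TD error (pred - target). A quick standalone sanity check of that property, assuming the same TF1 graph/session style used throughout this file (illustrative only, not part of the training graph):

    import tensorflow as tf

    pred = tf.constant(3.0)
    target = tf.constant(1.0)
    loss = tf.square(pred - target) / 2.   # same form as mse() above
    grad = tf.gradients(loss, pred)[0]     # d(loss)/d(pred) = pred - target

    with tf.Session() as sess:
        print(sess.run([loss, grad]))      # expected output: [2.0, 2.0]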
+ class Model(object): - def __init__(self, - policy, - ob_space, - ac_space, - nenvs, - total_timesteps, - nprocs=32, - nscripts=16, - nsteps=20, - nstack=4, - ent_coef=0.1, - vf_coef=0.5, - vf_fisher_coef=1.0, - lr=0.25, - max_grad_norm=0.001, - kfac_clip=0.001, - lrschedule='linear', - alpha=0.99, - epsilon=1e-5): - config = tf.ConfigProto( - allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) - nsml.bind(sess=sess) - #nact = ac_space.n - nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - - XY0 = tf.placeholder(tf.int32, [nbatch]) - XY1 = tf.placeholder(tf.int32, [nbatch]) - - # ADV == TD_TARGET - values - ADV = tf.placeholder(tf.float32, [nbatch]) - TD_TARGET = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) - - self.model = step_model = policy( - sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy( - sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - - # Policy 1 : Base Action : train_model.pi label = A - - script_mask = tf.concat( - [ - tf.zeros([nscripts * nsteps, 1]), - tf.ones([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) - - pi = train_model.pi - pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi, labels=A) - neglogpac *= tf.stop_gradient(pac_weight) - - inv_A = 1.0 - tf.cast(A, tf.float32) - - xy0_mask = tf.cast(A, tf.float32) - xy1_mask = tf.cast(A, tf.float32) - - condition0 = tf.equal(xy0_mask, 2) - xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) - xy0_mask = 1.0 - xy0_mask - - condition1 = tf.equal(xy1_mask, 2) - xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - - # One hot representation of chosen marine. - # [batch_size, 2] - pi_xy0 = train_model.pi_xy0 - pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) - - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy0, labels=XY0) - logpac_xy0 *= tf.stop_gradient(pac_weight) - logpac_xy0 *= tf.cast(xy0_mask, tf.float32) - - pi_xy1 = train_model.pi_xy1 - pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) - - # 1D? 2D? 
- logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy1, labels=XY1) - logpac_xy1 *= tf.stop_gradient(pac_weight) - logpac_xy1 *= tf.cast(xy1_mask, tf.float32) - - pg_loss = tf.reduce_mean(ADV * neglogpac) - pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) - - vf_ = tf.squeeze(train_model.vf) - - vf_r = tf.concat( - [ - tf.ones([nscripts * nsteps, 1]), - tf.zeros([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) * TD_TARGET - vf_masked = vf_ * script_mask + vf_r - - #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] - - vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) - entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) - entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) - entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) - entropy = entropy_a + entropy_xy0 + entropy_xy1 - - loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - - params = find_trainable_variables("model") - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train = trainer.apply_gradients(grads) - - self.logits = logits = train_model.pi - - # xy0 - - self.params_common = params_common = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy0') + params_common - - train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy0 = grads_xy0 = tf.gradients( - train_loss_xy0, params_xy0) - if max_grad_norm is not None: - grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) - - grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - - # xy1 - - self.params_xy1 = params_xy1 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy1') + params_common - - train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy1 = grads_xy1 = tf.gradients( - train_loss_xy1, params_xy1) - if max_grad_norm is not None: - grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) - - grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) - - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) - - def train(obs, states, td_targets, masks, actions, xy0, xy1, values): - advs = td_targets - values - for step in range(len(obs)): - cur_lr = self.lr.value() - - td_map = { - train_model.X: obs, - A: actions, - XY0: xy0, - XY1: xy1, - ADV: advs, - TD_TARGET: td_targets, - PG_LR: cur_lr - } - if states != []: - td_map[train_model.S] = states - td_map[train_model.M] = masks - - policy_loss, value_loss, policy_entropy, _, \ - policy_loss_xy0, policy_entropy_xy0, _, \ - policy_loss_xy1, policy_entropy_xy1, _ = sess.run( - [pg_loss, vf_loss, entropy, _train, - pg_loss_xy0, entropy_xy0, _train_xy0, - pg_loss_xy1, entropy_xy1, _train_xy1], - td_map) - return policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1 - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, 
save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - print("global_variables_initializer start") - tf.global_variables_initializer().run(session=sess) - print("global_variables_initializer complete") + def __init__(self, + policy, + ob_space, + ac_space, + nenvs, + total_timesteps, + nprocs=32, + nscripts=16, + nsteps=20, + nstack=4, + ent_coef=0.1, + vf_coef=0.5, + vf_fisher_coef=1.0, + lr=0.25, + max_grad_norm=0.001, + kfac_clip=0.001, + lrschedule='linear', + alpha=0.99, + epsilon=1e-5): + config = tf.ConfigProto( + allow_soft_placement=True, + intra_op_parallelism_threads=nprocs, + inter_op_parallelism_threads=nprocs) + config.gpu_options.allow_growth = True + self.sess = sess = tf.Session(config=config) + nsml.bind(sess=sess) + # nact = ac_space.n + nbatch = nenvs * nsteps + A = tf.placeholder(tf.int32, [nbatch]) + + XY0 = tf.placeholder(tf.int32, [nbatch]) + XY1 = tf.placeholder(tf.int32, [nbatch]) + + # ADV == TD_TARGET - values + ADV = tf.placeholder(tf.float32, [nbatch]) + TD_TARGET = tf.placeholder(tf.float32, [nbatch]) + PG_LR = tf.placeholder(tf.float32, []) + VF_LR = tf.placeholder(tf.float32, []) + + self.model = step_model = policy( + sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy( + sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + + # Policy 1 : Base Action : train_model.pi label = A + + script_mask = tf.concat( + [ + tf.zeros([nscripts * nsteps, 1]), + tf.ones([(nprocs - nscripts) * nsteps, 1]) + ], + axis=0) + + pi = train_model.pi + pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi, labels=A) + neglogpac *= tf.stop_gradient(pac_weight) + + inv_A = 1.0 - tf.cast(A, tf.float32) + + xy0_mask = tf.cast(A, tf.float32) + xy1_mask = tf.cast(A, tf.float32) + + condition0 = tf.equal(xy0_mask, 2) + xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) + xy0_mask = 1.0 - xy0_mask + + condition1 = tf.equal(xy1_mask, 2) + xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) + + # One hot representation of chosen marine. + # [batch_size, 2] + pi_xy0 = train_model.pi_xy0 + pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 + pac_weight = tf.reduce_sum( + pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi_xy0, labels=XY0) + logpac_xy0 *= tf.stop_gradient(pac_weight) + logpac_xy0 *= tf.cast(xy0_mask, tf.float32) + + pi_xy1 = train_model.pi_xy1 + pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 + pac_weight = tf.reduce_sum( + pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + # 1D? 2D? 
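# The coordinate heads are 1-D here: pi_xy0 and pi_xy1 each emit
# 1024 = 32 * 32 logits over the flattened screen, and the XY0 / XY1 labels
# are flat indices (y * 32 + x, as built in Runner.run below), which is why
# tf.one_hot(..., depth=1024) and sparse softmax cross-entropy with integer
# labels are used rather than 2-D coordinates.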
+ logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=pi_xy1, labels=XY1) + logpac_xy1 *= tf.stop_gradient(pac_weight) + logpac_xy1 *= tf.cast(xy1_mask, tf.float32) + + pg_loss = tf.reduce_mean(ADV * neglogpac) + pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + + vf_ = tf.squeeze(train_model.vf) + + vf_r = tf.concat( + [ + tf.ones([nscripts * nsteps, 1]), + tf.zeros([(nprocs - nscripts) * nsteps, 1]) + ], + axis=0) * TD_TARGET + vf_masked = vf_ * script_mask + vf_r + + # vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] + + vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) + entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) + entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) + entropy = entropy_a + entropy_xy0 + entropy_xy1 + + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + + params = find_trainable_variables("model") + grads = tf.gradients(loss, params) + if max_grad_norm is not None: + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) + grads = list(zip(grads, params)) + trainer = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train = trainer.apply_gradients(grads) + + self.logits = logits = train_model.pi + + # xy0 + + self.params_common = params_common = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, + scope='model/xy0') + params_common + + train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy0 = grads_xy0 = tf.gradients( + train_loss_xy0, params_xy0) + if max_grad_norm is not None: + grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) + + grads_xy0 = list(zip(grads_xy0, params_xy0)) + trainer_xy0 = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) + + # xy1 + + self.params_xy1 = params_xy1 = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, + scope='model/xy1') + params_common + + train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy1 = grads_xy1 = tf.gradients( + train_loss_xy1, params_xy1) + if max_grad_norm is not None: + grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) + + grads_xy1 = list(zip(grads_xy1, params_xy1)) + trainer_xy1 = tf.train.RMSPropOptimizer( + learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) + + self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + + def train(obs, states, td_targets, masks, actions, xy0, xy1, values): + advs = td_targets - values + for step in range(len(obs)): + cur_lr = self.lr.value() + + td_map = { + train_model.X: obs, + A: actions, + XY0: xy0, + XY1: xy1, + ADV: advs, + TD_TARGET: td_targets, + PG_LR: cur_lr + } + if states != []: + td_map[train_model.S] = states + td_map[train_model.M] = masks + + policy_loss, value_loss, policy_entropy, _, \ + policy_loss_xy0, policy_entropy_xy0, _, \ + policy_loss_xy1, policy_entropy_xy1, _ = sess.run( + [pg_loss, vf_loss, entropy, _train, + pg_loss_xy0, entropy_xy0, _train_xy0, + pg_loss_xy1, entropy_xy1, _train_xy1], + td_map) + return policy_loss, value_loss, policy_entropy, \ + policy_loss_xy0, policy_entropy_xy0, \ + policy_loss_xy1, policy_entropy_xy1 + + def save(save_path): + ps = sess.run(params) + joblib.dump(ps, 
save_path) + + def load(load_path): + loaded_params = joblib.load(load_path) + restores = [] + for p, loaded_p in zip(params, loaded_params): + restores.append(p.assign(loaded_p)) + sess.run(restores) + + self.train = train + self.save = save + self.load = load + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.value = step_model.value + self.initial_state = step_model.initial_state + print("global_variables_initializer start") + tf.global_variables_initializer().run(session=sess) + print("global_variables_initializer complete") class Runner(object): - def __init__(self, - env, - model, - nsteps, - nscripts, - nstack, - gamma, - callback=None): - self.env = env - self.model = model - nh, nw, nc = (32, 32, 3) - self.nsteps = nsteps - self.nscripts = nscripts - self.nenv = nenv = env.num_envs - self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) - self.batch_coord_shape = (nenv * nsteps, 32) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - self.available_actions = None - self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset( - ) - self.xy_per_marine = [{"0":[0,0], "1":[0,0]} for _ in range(nenv)] - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - self.update_obs(obs) # (2,13,32,32) - self.update_available(available_actions) - self.gamma = gamma - self.states = model.initial_state - self.dones = [False for _ in range(nenv)] - self.total_reward = [0.0 for _ in range(nenv)] - self.episode_rewards = [] - self.episode_rewards_script = [] - self.episode_rewards_a2c = [] - self.episodes = 0 - self.steps = 0 - self.callback = callback - - self.action_queue = [[] for _ in range(nenv)] - self.group_list = [[] for _ in range(nenv)] - self.agent_state = ["IDLE" for _ in range(nenv)] - self.dest_per_marine = [{} for _ in range(nenv)] - - self.group_id = [0 for _ in range(nenv)] - - def update_obs(self, obs): # (self.nenv, 32, 32, 2) - obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) - self.obs = np.roll(self.obs, shift=-3, axis=3) - new_map = np.zeros((self.nenv, 32, 32, 3)) - new_map[:, :, :, -1] = obs[:, 0, :, :] - for env_num in range(self.nenv): - print("xy_per_marine: ", self.xy_per_marine) - if "0" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["0"] = [0, 0] - if "1" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["1"] = [0, 0] - - marine0 = self.xy_per_marine[env_num]["0"] - marine1 = self.xy_per_marine[env_num]["1"] - new_map[env_num, marine0[0], marine0[1], -3] = 1 - new_map[env_num, marine1[0], marine1[1], -2] = 1 - self.obs[:, :, :, -3:] = new_map - # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) - - def update_available(self, _available_actions): - print("update_available : ", _available_actions) - self.available_actions = _available_actions - # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) - self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) - for env_num, list in enumerate(_available_actions): - print("env_num :", env_num, " list :", list) - for action_num in list: - print("action_num :", action_num) - if (action_num == 4): - self.base_act_mask[env_num][0] = 1 - self.base_act_mask[env_num][1] = 1 - elif action_num == 0: - self.base_act_mask[env_num][2] = 1 - # 
elif(action_num == 331): - # self.base_act_mask[env_num][2] = 1 - - def valid_base_action(self, base_actions): - for env_num, list in enumerate(self.available_actions): - avail = [] - for action_num in list: - if (action_num == 4): - avail.append(0) - avail.append(1) - elif action_num == 0: - avail.append(2) - # elif(action_num == 331): - # avail.append(2) - - if base_actions[env_num] not in avail: - print("env_num", env_num, " argmax is not valid. random pick ", - avail) - base_actions[env_num] = np.random.choice(avail) - - return base_actions - - def trans_base_actions(self, base_actions): - new_base_actions = np.copy(base_actions) - for env_num, ba in enumerate(new_base_actions): - if (ba == 0): - new_base_actions[env_num] = 4 # move marine control group 0 - elif (ba == 1): - new_base_actions[env_num] = 4 # move marine control group 1 - elif (ba == 2): - new_base_actions[env_num] = 0 # move marine control group 1 - # elif(ba==2): - # new_base_actions[env_num] = 331 # move marine xy0 - - return new_base_actions - - - def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): - actions = [] - for env_num, spec in enumerate(base_action_spec): - # print("spec", spec.args) - args = [] - # for arg_idx, arg in enumerate(spec.args): - # #print("arg", arg) - # #print("arg.id", arg.id) - # if(arg.id==0): # screen (32,32) x0, y0 - # args.append([int(x0[env_num]), int(y0[env_num])]) - # # elif(arg.id==1): # minimap (32,32) x1, y1 - # # args.append([int(x1[env_num]), int(y1[env_num])]) - # # elif(arg.id==2): # screen2 (32,32) x2, y2 - # # args.append([int(x2[env_num]), y2[env_num]]) - # elif(arg.id==3): # pi3 queued (2) - # args.append([int(0)]) - # elif(arg.id==4): # pi4 control_group_act (5) - # args.append([_CONTROL_GROUP_RECALL]) - # elif(arg.id==5): # pi5 control_group_id 10 - # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 - # # elif(arg.id==6): # pi6 select_point_act 4 - # # args.append([int(sub6[env_num])]) - # # elif(arg.id==7): # pi7 select_add 2 - # # args.append([int(sub7[env_num])]) - # # elif(arg.id==8): # pi8 select_unit_act 4 - # # args.append([int(sub8[env_num])]) - # # elif(arg.id==9): # pi9 select_unit_id 500 - # # args.append([int(sub9[env_num])]) - # # elif(arg.id==10): # pi10 select_worker 4 - # # args.append([int(sub10[env_num])]) - # # elif(arg.id==11): # pi11 build_queue_id 10 - # # args.append([int(sub11[env_num])]) - # # elif(arg.id==12): # pi12 unload_id 500 - # # args.append([int(sub12[env_num])]) - # else: - # raise NotImplementedError("cannot construct this arg", spec.args) - two_action = [] - if base_actions[env_num] == 0: - two_action.append( - sc2_actions.FunctionCall( - 4, - [[_CONTROL_GROUP_RECALL], [0]] - )) - - two_action.append( - sc2_actions.FunctionCall( - 331, - [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) - - elif base_actions[env_num] == 1: - two_action.append( - sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append( - sc2_actions.FunctionCall( - 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) - elif base_actions[env_num] == 2: - two_action.append( - sc2_actions.FunctionCall(0, [])) - two_action.append( - sc2_actions.FunctionCall(0, [])) - - #action = sc2_actions.FunctionCall(a, args) - actions.append(two_action) - - return actions - - def run(self): - mb_obs, mb_td_targets, mb_base_actions, \ - mb_xy0, mb_xy1, \ - mb_values, mb_dones \ - = [], [], [], [], [], [], [] - - mb_states = self.states - for n in range(self.nsteps): - # pi, pi2, x1, y1, x2, y2, v0 - pi1, pi_xy0, pi_xy1, 
values, states = self.model.step( - self.obs, self.states, self.dones) - - pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - - base_actions = np.argmax( - pi1 * self.base_act_mask + pi1_noise, axis=1) - xy0 = np.argmax(pi_xy0, axis=1) - - x0 = (xy0 % 32).astype(int) - y0 = (xy0 / 32).astype(int) - - xy1 = np.argmax(pi_xy1, axis=1) - x1 = (xy1 % 32).astype(int) - y1 = (xy1 / 32).astype(int) - - # Scripted Agent Hacking - - for env_num in range(self.nenv): - if env_num >= self.nscripts: # only for scripted agents - continue - - ob = self.obs[env_num, :, :, :] - player_relative = ob[:, :, -1] - - self.group_list[env_num] = common.update_group_list2( - self.control_groups[env_num]) - - if len(self.action_queue[env_num]) == 0: - - self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \ - common.solve_tsp(player_relative, - self.selected[env_num][0], - self.group_list[env_num], - self.group_id[env_num], - self.dest_per_marine[env_num], - self.xy_per_marine[env_num]) - - base_actions[env_num] = 0 - x0[env_num] = 0 - y0[env_num] = 0 - x1[env_num] = 0 - y1[env_num] = 0 - - if len(self.action_queue[env_num]) > 0: - action = self.action_queue[env_num].pop(0) - base_actions[env_num] = action.get("base_action", 0) - - x0[env_num] = action.get("x0", 0) - y0[env_num] = action.get("y0", 0) - xy0[env_num] = y0[env_num] * 32 + x0[env_num] - - x1[env_num] = action.get("x1", 0) - y1[env_num] = action.get("y1", 0) - xy1[env_num] = y1[env_num] * 32 + x1[env_num] - - base_actions = self.valid_base_action(base_actions) - new_base_actions = self.trans_base_actions(base_actions) - - base_action_spec = self.env.action_spec(new_base_actions) - # print("base_actions:", base_actions) - actions = self.construct_action( - base_actions, - base_action_spec, - x0, - y0, - x1, - y1 - ) - - mb_obs.append(np.copy(self.obs)) - mb_base_actions.append(base_actions) - - mb_xy0.append(xy0) - mb_xy1.append(xy1) - mb_values.append(values) - mb_dones.append(self.dones) - - #print("final acitons : ", actions) - obs, rewards, dones,\ - available_actions, army_counts,\ - control_groups, selected, xy_per_marine\ - = self.env.step( - actions=actions) - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.update_available(available_actions) - - self.states = states - self.dones = dones - mean_100ep_reward_a2c = 0 - for n, done in enumerate(dones): - self.total_reward[n] += float(rewards[n]) - if done: - self.obs[n] = self.obs[n] * 0 - self.episodes += 1 - num_episodes = self.episodes - self.episode_rewards.append(self.total_reward[n]) - - model = self.model - mean_100ep_reward = round( - np.mean(self.episode_rewards[-101:]), 1) - if (n < self.nscripts): # scripted agents - self.episode_rewards_script.append( - self.total_reward[n]) - mean_100ep_reward_script = round( - np.mean(self.episode_rewards_script[-101:]), 1) - nsml.report( - reward_script=self.total_reward[n], - mean_reward_script=mean_100ep_reward_script, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - else: - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round( - np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report( - reward_a2c=self.total_reward[n], - mean_reward_a2c=mean_100ep_reward_a2c, - reward=self.total_reward[n], - 
mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() + def __init__(self, + env, + model, + nsteps, + nscripts, + nstack, + gamma, + callback=None): + self.env = env + self.model = model + nh, nw, nc = (32, 32, 3) + self.nsteps = nsteps + self.nscripts = nscripts + self.nenv = nenv = env.num_envs + self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) + self.batch_coord_shape = (nenv * nsteps, 32) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + self.available_actions = None + self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() + self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + self.update_obs(obs) # (2,13,32,32) + self.update_available(available_actions) + self.gamma = gamma + self.states = model.initial_state + self.dones = [False for _ in range(nenv)] + self.total_reward = [0.0 for _ in range(nenv)] + self.episode_rewards = [] + self.episode_rewards_script = [] + self.episode_rewards_a2c = [] + self.episodes = 0 + self.steps = 0 + self.callback = callback + + self.action_queue = [[] for _ in range(nenv)] + self.group_list = [[] for _ in range(nenv)] + self.agent_state = ["IDLE" for _ in range(nenv)] + self.dest_per_marine = [{} for _ in range(nenv)] + + self.group_id = [0 for _ in range(nenv)] + + def update_obs(self, obs): # (self.nenv, 32, 32, 2) + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + self.obs = np.roll(self.obs, shift=-3, axis=3) + new_map = np.zeros((self.nenv, 32, 32, 3)) + new_map[:, :, :, -1] = obs[:, 0, :, :] + for env_num in range(self.nenv): + # print("xy_per_marine: ", self.xy_per_marine) + if "0" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["0"] = [0, 0] + if "1" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["1"] = [0, 0] + + marine0 = self.xy_per_marine[env_num]["0"] + marine1 = self.xy_per_marine[env_num]["1"] + new_map[env_num, marine0[0], marine0[1], -3] = 1 + new_map[env_num, marine1[0], marine1[1], -2] = 1 + self.obs[:, :, :, -3:] = new_map + # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) + + def update_available(self, _available_actions): + # print("update_available : ", _available_actions) + self.available_actions = _available_actions + # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) + self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) + for env_num, list in enumerate(_available_actions): + # print("env_num :", env_num, " list :", list) + for action_num in list: + # print("action_num :", action_num) + if (action_num == 4): + self.base_act_mask[env_num][0] = 1 + self.base_act_mask[env_num][1] = 1 + elif action_num == 0: + self.base_act_mask[env_num][2] = 1 + # elif(action_num == 331): + # self.base_act_mask[env_num][2] = 1 + + def valid_base_action(self, base_actions): + for env_num, list in enumerate(self.available_actions): + avail = [] + for action_num in list: + if (action_num == 4): + avail.append(0) + avail.append(1) + elif action_num == 0: + avail.append(2) + # elif(action_num == 331): + # avail.append(2) + + if base_actions[env_num] not in avail: + # print("env_num", env_num, " argmax is not valid. 
random pick ", avail) + base_actions[env_num] = np.random.choice(avail) + + return base_actions + + def trans_base_actions(self, base_actions): + new_base_actions = np.copy(base_actions) + for env_num, ba in enumerate(new_base_actions): + if (ba == 0): + new_base_actions[env_num] = 4 # move marine control group 0 + elif (ba == 1): + new_base_actions[env_num] = 4 # move marine control group 1 + elif (ba == 2): + new_base_actions[env_num] = 0 # move marine control group 1 + # elif(ba==2): + # new_base_actions[env_num] = 331 # move marine xy0 + + return new_base_actions + + def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): + actions = [] + for env_num, spec in enumerate(base_action_spec): + # print("spec", spec.args) + args = [] + # for arg_idx, arg in enumerate(spec.args): + # #print("arg", arg) + # #print("arg.id", arg.id) + # if(arg.id==0): # screen (32,32) x0, y0 + # args.append([int(x0[env_num]), int(y0[env_num])]) + # # elif(arg.id==1): # minimap (32,32) x1, y1 + # # args.append([int(x1[env_num]), int(y1[env_num])]) + # # elif(arg.id==2): # screen2 (32,32) x2, y2 + # # args.append([int(x2[env_num]), y2[env_num]]) + # elif(arg.id==3): # pi3 queued (2) + # args.append([int(0)]) + # elif(arg.id==4): # pi4 control_group_act (5) + # args.append([_CONTROL_GROUP_RECALL]) + # elif(arg.id==5): # pi5 control_group_id 10 + # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 + # # elif(arg.id==6): # pi6 select_point_act 4 + # # args.append([int(sub6[env_num])]) + # # elif(arg.id==7): # pi7 select_add 2 + # # args.append([int(sub7[env_num])]) + # # elif(arg.id==8): # pi8 select_unit_act 4 + # # args.append([int(sub8[env_num])]) + # # elif(arg.id==9): # pi9 select_unit_id 500 + # # args.append([int(sub9[env_num])]) + # # elif(arg.id==10): # pi10 select_worker 4 + # # args.append([int(sub10[env_num])]) + # # elif(arg.id==11): # pi11 build_queue_id 10 + # # args.append([int(sub11[env_num])]) + # # elif(arg.id==12): # pi12 unload_id 500 + # # args.append([int(sub12[env_num])]) + # else: + # raise NotImplementedError("cannot construct this arg", spec.args) + two_action = [] + if base_actions[env_num] == 0: + two_action.append( + sc2_actions.FunctionCall( + 4, + [[_CONTROL_GROUP_RECALL], [0]] + )) + + two_action.append( + sc2_actions.FunctionCall( + 331, + [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + + elif base_actions[env_num] == 1: + two_action.append( + sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append( + sc2_actions.FunctionCall( + 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + elif base_actions[env_num] == 2: + two_action.append( + sc2_actions.FunctionCall(0, [])) + two_action.append( + sc2_actions.FunctionCall(0, [])) + + # action = sc2_actions.FunctionCall(a, args) + actions.append(two_action) + + return actions + + def run(self): + mb_obs, mb_td_targets, mb_base_actions, \ + mb_xy0, mb_xy1, \ + mb_values, mb_dones \ + = [], [], [], [], [], [], [] + + mb_states = self.states + for n in range(self.nsteps): + # pi, pi2, x1, y1, x2, y2, v0 + pi1, pi_xy0, pi_xy1, values, states = self.model.step( + self.obs, self.states, self.dones) + + pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 + + base_actions = np.argmax( + pi1 * self.base_act_mask + pi1_noise, axis=1) + xy0 = np.argmax(pi_xy0, axis=1) + + x0 = (xy0 % 32).astype(int) + y0 = (xy0 / 32).astype(int) + + xy1 = np.argmax(pi_xy1, axis=1) + x1 = (xy1 % 32).astype(int) + y1 = (xy1 / 32).astype(int) + + # Scripted Agent Hacking + + for env_num in 
range(self.nenv): + if env_num >= self.nscripts: # only for scripted agents + continue + + ob = self.obs[env_num, :, :, :] + player_relative = ob[:, :, -1] + + self.group_list[env_num] = common.update_group_list2( + self.control_groups[env_num]) + + if len(self.action_queue[env_num]) == 0: + self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], \ + self.xy_per_marine[env_num] = \ + common.solve_tsp(player_relative, + self.selected[env_num][0], + self.group_list[env_num], + self.group_id[env_num], + self.dest_per_marine[env_num], + self.xy_per_marine[env_num]) + + base_actions[env_num] = 0 + x0[env_num] = 0 + y0[env_num] = 0 + x1[env_num] = 0 + y1[env_num] = 0 + + if len(self.action_queue[env_num]) > 0: + action = self.action_queue[env_num].pop(0) + base_actions[env_num] = action.get("base_action", 0) + + x0[env_num] = action.get("x0", 0) + y0[env_num] = action.get("y0", 0) + xy0[env_num] = y0[env_num] * 32 + x0[env_num] + + x1[env_num] = action.get("x1", 0) + y1[env_num] = action.get("y1", 0) + xy1[env_num] = y1[env_num] * 32 + x1[env_num] + + base_actions = self.valid_base_action(base_actions) + new_base_actions = self.trans_base_actions(base_actions) + + base_action_spec = self.env.action_spec(new_base_actions) + # print("base_actions:", base_actions) + actions = self.construct_action( + base_actions, + base_action_spec, + x0, + y0, + x1, + y1 ) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) - if self.callback is not None: - self.callback(locals(), globals()) - self.total_reward[n] = 0 - self.group_list[n] = [] - - - self.update_obs(obs) - mb_td_targets.append(rewards) - mb_dones.append(self.dones) - #batch of steps to batch of rollouts - mb_obs = np.asarray( - mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( + mb_obs.append(np.copy(self.obs)) + mb_base_actions.append(base_actions) + + mb_xy0.append(xy0) + mb_xy1.append(xy1) + mb_values.append(values) + mb_dones.append(self.dones) + + # print("final acitons : ", actions) + obs, rewards, dones, \ + available_actions, army_counts, \ + control_groups, selected, xy_per_marine \ + = self.env.step( + actions=actions) + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.update_available(available_actions) + + self.states = states + self.dones = dones + mean_100ep_reward_a2c = 0 + for n, done in enumerate(dones): + self.total_reward[n] += float(rewards[n]) + if done: + self.obs[n] = self.obs[n] * 0 + self.episodes += 1 + num_episodes = self.episodes + self.episode_rewards.append(self.total_reward[n]) + + model = self.model + mean_100ep_reward = round( + np.mean(self.episode_rewards[-101:]), 1) + if (n < self.nscripts): # scripted agents + self.episode_rewards_script.append( + self.total_reward[n]) + mean_100ep_reward_script = round( + np.mean(self.episode_rewards_script[-101:]), 1) + nsml.report( + reward_script=self.total_reward[n], + mean_reward_script=mean_100ep_reward_script, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + else: + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round( + np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report( + reward_a2c=self.total_reward[n], + mean_reward_a2c=mean_100ep_reward_a2c, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + 
scope=locals() + ) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + + if self.callback is not None: + self.callback(locals(), globals()) + self.total_reward[n] = 0 + self.group_list[n] = [] + + self.update_obs(obs) + mb_td_targets.append(rewards) + mb_dones.append(self.dones) + # batch of steps to batch of rollouts + mb_obs = np.asarray( + mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( self.batch_ob_shape) - mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray( - mb_base_actions, dtype=np.int32).swapaxes(1, 0) - - mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) - mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) - - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, - self.dones).tolist() - #discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate( - zip(mb_td_targets, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], - self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_td_targets[n] = rewards - mb_td_targets = mb_td_targets.flatten() - mb_base_actions = mb_base_actions.flatten() - mb_xy0 = mb_xy0.flatten() - mb_xy1 = mb_xy1.flatten() - - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, \ - mb_base_actions, mb_xy0, mb_xy1, mb_values + mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) + mb_base_actions = np.asarray( + mb_base_actions, dtype=np.int32).swapaxes(1, 0) + + mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) + mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) + + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + last_values = self.model.value(self.obs, self.states, + self.dones).tolist() + # discount/bootstrap off value fn + for n, (rewards, dones, value) in enumerate( + zip(mb_td_targets, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards + [value], dones + [0], + self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + mb_td_targets[n] = rewards + mb_td_targets = mb_td_targets.flatten() + mb_base_actions = mb_base_actions.flatten() + mb_xy0 = mb_xy0.flatten() + mb_xy1 = mb_xy1.flatten() + + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_td_targets, mb_masks, \ + mb_base_actions, mb_xy0, mb_xy1, mb_values def learn(policy, @@ -645,102 +643,102 @@ def learn(policy, save_interval=None, lrschedule='linear', callback=None): - tf.reset_default_graph() - set_global_seeds(seed) - - nenvs = nprocs - ob_space = (32, 32, 3) # env.observation_space - ac_space = (32, 32) - make_model = lambda: Model(policy, ob_space, ac_space, nenvs, - total_timesteps, - nprocs=nprocs, - nscripts=nscripts, - nsteps=nsteps, - nstack=nstack, - ent_coef=ent_coef, - vf_coef=vf_coef, - vf_fisher_coef=vf_fisher_coef, - lr=lr, - max_grad_norm=max_grad_norm, - kfac_clip=kfac_clip, - lrschedule=lrschedule) - - if save_interval and logger.get_dir(): - import 
cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) - model = make_model() - print("make_model complete!") - runner = Runner( - env, - model, - nsteps=nsteps, - nscripts=nscripts, - nstack=nstack, - gamma=gamma, - callback=callback) - nbatch = nenvs * nsteps - tstart = time.time() - # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) - for update in range(1, total_timesteps // nbatch + 1): - obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - - policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1, \ - = model.train(obs, states, td_targets, - masks, actions, - xy0, xy1, values) - - model.old_obs = obs - nseconds = time.time() - tstart - fps = int((update * nbatch) / nseconds) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, td_targets) - # nsml.report( - # nupdates=update, - # total_timesteps=update * nbatch, - # fps=fps, - # policy_entropy=float(policy_entropy), - # policy_loss=float(policy_loss), - - # policy_loss_xy0=float(policy_loss_xy0), - # policy_entropy_xy0=float(policy_entropy_xy0), - - # policy_loss_xy1=float(policy_loss_xy1), - # policy_entropy_xy1=float(policy_entropy_xy1), - - # value_loss=float(value_loss), - # explained_variance=float(ev), - - # batch_size=nbatch, - # step=update, - - # scope=locals() - # ) - # logger.record_tabular("nupdates", update) - # logger.record_tabular("total_timesteps", update * nbatch) - # logger.record_tabular("fps", fps) - # logger.record_tabular("policy_entropy", float(policy_entropy)) - # logger.record_tabular("policy_loss", float(policy_loss)) - - # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) - # logger.record_tabular("policy_entropy_xy0", - # float(policy_entropy_xy0)) - # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) - # logger.record_tabular("policy_entropy_xy1", - # float(policy_entropy_xy1)) - # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) - # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) - - # logger.record_tabular("value_loss", float(value_loss)) - # logger.record_tabular("explained_variance", float(ev)) - # logger.dump_tabular() - - if save_interval and (update % save_interval == 0 - or update == 1) and logger.get_dir(): - savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) - print('Saving to', savepath) - model.save(savepath) - - env.close() + tf.reset_default_graph() + set_global_seeds(seed) + + nenvs = nprocs + ob_space = (32, 32, 3) # env.observation_space + ac_space = (32, 32) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, + total_timesteps, + nprocs=nprocs, + nscripts=nscripts, + nsteps=nsteps, + nstack=nstack, + ent_coef=ent_coef, + vf_coef=vf_coef, + vf_fisher_coef=vf_fisher_coef, + lr=lr, + max_grad_norm=max_grad_norm, + kfac_clip=kfac_clip, + lrschedule=lrschedule) + + if save_interval and logger.get_dir(): + import cloudpickle + with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: + fh.write(cloudpickle.dumps(make_model)) + model = make_model() + print("make_model complete!") + runner = Runner( + env, + model, + nsteps=nsteps, + nscripts=nscripts, + nstack=nstack, + gamma=gamma, + callback=callback) + nbatch = nenvs * nsteps + tstart = time.time() + # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), 
start=True) + for update in range(1, total_timesteps // nbatch + 1): + obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() + + policy_loss, value_loss, policy_entropy, \ + policy_loss_xy0, policy_entropy_xy0, \ + policy_loss_xy1, policy_entropy_xy1, \ + = model.train(obs, states, td_targets, + masks, actions, + xy0, xy1, values) + + model.old_obs = obs + nseconds = time.time() - tstart + fps = int((update * nbatch) / nseconds) + if update % log_interval == 0 or update == 1: + ev = explained_variance(values, td_targets) + # nsml.report( + # nupdates=update, + # total_timesteps=update * nbatch, + # fps=fps, + # policy_entropy=float(policy_entropy), + # policy_loss=float(policy_loss), + + # policy_loss_xy0=float(policy_loss_xy0), + # policy_entropy_xy0=float(policy_entropy_xy0), + + # policy_loss_xy1=float(policy_loss_xy1), + # policy_entropy_xy1=float(policy_entropy_xy1), + + # value_loss=float(value_loss), + # explained_variance=float(ev), + + # batch_size=nbatch, + # step=update, + + # scope=locals() + # ) + # logger.record_tabular("nupdates", update) + # logger.record_tabular("total_timesteps", update * nbatch) + # logger.record_tabular("fps", fps) + # logger.record_tabular("policy_entropy", float(policy_entropy)) + # logger.record_tabular("policy_loss", float(policy_loss)) + + # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) + # logger.record_tabular("policy_entropy_xy0", + # float(policy_entropy_xy0)) + # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) + # logger.record_tabular("policy_entropy_xy1", + # float(policy_entropy_xy1)) + # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) + # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) + + # logger.record_tabular("value_loss", float(value_loss)) + # logger.record_tabular("explained_variance", float(ev)) + # logger.dump_tabular() + + if save_interval and (update % save_interval == 0 + or update == 1) and logger.get_dir(): + savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) + print('Saving to', savepath) + model.save(savepath) + + env.close() diff --git a/common/common.py b/common/common.py index 73dbd9f..1cda3c6 100644 --- a/common/common.py +++ b/common/common.py @@ -420,12 +420,12 @@ def check_group_list(env, obs): if group[0] == 48: army_count += group[1] if group[1] != 1: - print("group error group_id : %s count : %s" % (id, group[1])) + # print("group error group_id : %s count : %s" % (id, group[1])) error = True return error if army_count != env._obs[0].observation.player_common.army_count: error = True - print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) + # print("army_count %s != %s env._obs.observation.player_common.army_count " % (army_count, env._obs[0].observation.player_common.army_count)) return error diff --git a/common/vec_env/subproc_vec_env.py b/common/vec_env/subproc_vec_env.py index 16fcb99..56da9a0 100644 --- a/common/vec_env/subproc_vec_env.py +++ b/common/vec_env/subproc_vec_env.py @@ -42,9 +42,9 @@ def worker(remote, map_name, nscripts, i): action1 = data[0][0] action2 = data[0][1] func = actions.FUNCTIONS[action1[0]] - print("agent(",i," ) action : ", action1, " func : ", func) + # print("agent(",i," ) action : ", action1, " func : ", func) func = actions.FUNCTIONS[action2[0]] - print("agent(",i," ) action : ", action2, " func : ", func) + # print("agent(",i," ) action : ", action2, " func : ", func) result = 
env.step(actions=[action1]) @@ -55,7 +55,7 @@ def worker(remote, map_name, nscripts, i): if len(action2[1]) == 2: x, y = action2[1][1] - print("x, y:", x, y) + # print("x, y:", x, y) if x == 0 and y == 0: move = False diff --git a/deepq_mineral_shards.py b/deepq_mineral_shards.py index d01cf01..ba231e7 100644 --- a/deepq_mineral_shards.py +++ b/deepq_mineral_shards.py @@ -327,7 +327,7 @@ def make_obs_ph(name): rew = obs[0].reward - done = obs[0].step_type == environment.StepType.LAST + done = obs[0].step_type == environment.StepType.LAST or obs[0].step_type == environment.StepType.FIRST # Store transition in the replay buffer. replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) @@ -349,7 +349,6 @@ def make_obs_ph(name): # Select all marines first env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) episode_rewards.append(0.0) - #episode_minerals.append(0.0) reset = True diff --git a/defeat_zerglings/dqfd.py b/defeat_zerglings/dqfd.py index 0f1aee7..7a40701 100644 --- a/defeat_zerglings/dqfd.py +++ b/defeat_zerglings/dqfd.py @@ -115,15 +115,15 @@ def learn(env, q_func, num_actions=3, lr=5e-4, - max_timesteps=100000, - buffer_size=50000, + max_timesteps=10000, + buffer_size=5000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, - checkpoint_freq=10000, - learning_starts=1000, + checkpoint_freq=1000, + learning_starts=100, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, @@ -352,8 +352,7 @@ def make_obs_ph(name): if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() - player_relative = obs[0].observation["feature_screen"][ - _PLAYER_RELATIVE] + player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = player_relative @@ -373,7 +372,7 @@ def make_obs_ph(name): else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + td_errors = train(np.expand_dims(obses_t, axis=1), actions, rewards, np.expand_dims(obses_tp1, axis=1), dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) diff --git a/train_defeat_zerglings.py b/train_defeat_zerglings.py index 5eb3d79..279e2ba 100644 --- a/train_defeat_zerglings.py +++ b/train_defeat_zerglings.py @@ -92,13 +92,13 @@ def main(): q_func=model, num_actions=3, lr=1e-4, - max_timesteps=10000000, - buffer_size=100000, + max_timesteps=1000000, + buffer_size=10000, exploration_fraction=0.5, exploration_final_eps=0.01, train_freq=2, - learning_starts=100000, - target_network_update_freq=1000, + learning_starts=10000, + target_network_update_freq=100, gamma=0.99, prioritized_replay=True, callback=deepq_callback diff --git a/train_mineral_shards.py b/train_mineral_shards.py index 3661718..ddbd78d 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -33,7 +33,7 @@ flags.DEFINE_boolean("prioritized", True, "prioritized_replay") flags.DEFINE_boolean("dueling", True, "dueling") flags.DEFINE_float("lr", 0.0005, "Learning rate") -flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +flags.DEFINE_integer("num_agents", 6, "number of RL agents for A2C") flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") From 39a68926a7e72e18067409f12d8da98621a9d6ae Mon 
Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 16:17:30 -0500 Subject: [PATCH 04/11] Pulling out a2c functionality for readability and debugging. --- a2c/a2c.py | 48 +------------- train_mineral_shards.py | 10 +-- train_mineral_shards_a2c.py | 129 ++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 54 deletions(-) create mode 100644 train_mineral_shards_a2c.py diff --git a/a2c/a2c.py b/a2c/a2c.py index 5e2dbfa..68afcc4 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -683,57 +683,11 @@ def learn(policy, for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1, \ - = model.train(obs, states, td_targets, - masks, actions, - xy0, xy1, values) + policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs nseconds = time.time() - tstart fps = int((update * nbatch) / nseconds) - if update % log_interval == 0 or update == 1: - ev = explained_variance(values, td_targets) - # nsml.report( - # nupdates=update, - # total_timesteps=update * nbatch, - # fps=fps, - # policy_entropy=float(policy_entropy), - # policy_loss=float(policy_loss), - - # policy_loss_xy0=float(policy_loss_xy0), - # policy_entropy_xy0=float(policy_entropy_xy0), - - # policy_loss_xy1=float(policy_loss_xy1), - # policy_entropy_xy1=float(policy_entropy_xy1), - - # value_loss=float(value_loss), - # explained_variance=float(ev), - - # batch_size=nbatch, - # step=update, - - # scope=locals() - # ) - # logger.record_tabular("nupdates", update) - # logger.record_tabular("total_timesteps", update * nbatch) - # logger.record_tabular("fps", fps) - # logger.record_tabular("policy_entropy", float(policy_entropy)) - # logger.record_tabular("policy_loss", float(policy_loss)) - - # logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0)) - # logger.record_tabular("policy_entropy_xy0", - # float(policy_entropy_xy0)) - # logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1)) - # logger.record_tabular("policy_entropy_xy1", - # float(policy_entropy_xy1)) - # # logger.record_tabular("policy_loss_y0", float(policy_loss_y0)) - # # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0)) - - # logger.record_tabular("value_loss", float(value_loss)) - # logger.record_tabular("explained_variance", float(ev)) - # logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): diff --git a/train_mineral_shards.py b/train_mineral_shards.py index ddbd78d..005377e 100644 --- a/train_mineral_shards.py +++ b/train_mineral_shards.py @@ -183,8 +183,7 @@ def deepq_callback(locals, globals): global max_mean_reward, last_filename if 'done' in locals and locals['done'] == True: - if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10\ - and locals['mean_100ep_reward'] > max_mean_reward: + if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward'] > max_mean_reward: print("mean_100ep_reward : %s max_mean_reward : %s" % (locals['mean_100ep_reward'], max_mean_reward)) @@ -258,8 +257,7 @@ def deepq_4way_callback(locals, globals): def a2c_callback(locals, globals): global max_mean_reward, last_filename - if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10\ - and 
locals['mean_100ep_reward'] > max_mean_reward: + if 'mean_100ep_reward' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward'] > max_mean_reward: print("mean_100ep_reward : %s max_mean_reward : %s" % (locals['mean_100ep_reward'], max_mean_reward)) @@ -280,9 +278,7 @@ def a2c_callback(locals, globals): max_mean_reward = locals['mean_100ep_reward'] model = locals['model'] - filename = os.path.join( - PROJ_DIR, - 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward']) + filename = os.path.join(PROJ_DIR, 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward']) model.save(filename) print("save best mean_100ep_reward model to %s" % filename) last_filename = filename diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py new file mode 100644 index 0000000..ba3e90f --- /dev/null +++ b/train_mineral_shards_a2c.py @@ -0,0 +1,129 @@ +import sys +import os +import datetime +import random + +from absl import flags + +from pysc2.env import sc2_env +from pysc2.lib import actions +from baselines_legacy import cnn_to_mlp, BatchInput +from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat + +from common.vec_env.subproc_vec_env import SubprocVecEnv +from a2c.policies import CnnPolicy +from a2c import a2c +import deepq_mineral_4way +import deepq_mineral_shards + +_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id +_SELECT_ARMY = actions.FUNCTIONS.select_army.id +_SELECT_ALL = [0] +_NOT_QUEUED = [0] + +step_mul = 8 + +FLAGS = flags.FLAGS +flags.DEFINE_string("map", "CollectMineralShards", + "Name of a map to use to play.") +start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") +flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") +flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") +flags.DEFINE_integer("timesteps", 2000000, "Steps to train") +flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") +flags.DEFINE_boolean("prioritized", True, "prioritized_replay") +flags.DEFINE_boolean("dueling", True, "dueling") +flags.DEFINE_float("lr", 0.0005, "Learning rate") +flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") +flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") + +PROJ_DIR = os.path.dirname(os.path.abspath(__file__)) + +max_mean_reward = 0 +last_filename = "" +logdir = "" + + +def main(): + FLAGS(sys.argv) + + print("algorithm : %s" % FLAGS.algorithm) + print("timesteps : %s" % FLAGS.timesteps) + print("exploration_fraction : %s" % FLAGS.exploration_fraction) + print("prioritized : %s" % FLAGS.prioritized) + print("dueling : %s" % FLAGS.dueling) + print("num_agents : %s" % FLAGS.num_agents) + print("lr : %s" % FLAGS.lr) + + if FLAGS.lr == 0: + FLAGS.lr = random.uniform(0.00001, 0.001) + print("random lr : %s" % FLAGS.lr) + lr_round = round(FLAGS.lr, 8) + + logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) + + Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) + + num_timesteps = int(40e6) + num_timesteps //= 4 + + seed = 0 + + env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, + FLAGS.map) + + policy_fn = CnnPolicy + a2c.learn( + policy_fn, + env, + seed, + total_timesteps=num_timesteps, + nprocs=FLAGS.num_agents + FLAGS.num_scripts, + nscripts=FLAGS.num_scripts, + 
ent_coef=0.5, + nsteps=FLAGS.nsteps, + max_grad_norm=0.01, + callback=a2c_callback) + +from baselines import logger + +def a2c_callback(locals, globals): + global max_mean_reward, last_filename + + logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) + logger.record_tabular("num_episodes", locals['num_episodes']) + logger.record_tabular("environment_number", locals['env_num']) + logger.record_tabular("done", locals['done']) + + if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: + print("mean_100ep_reward_a2c : %s max_mean_reward : %s" % + (locals['mean_100ep_reward_a2c'], max_mean_reward)) + max_mean_reward = locals['mean_100ep_reward_a2c'] + logger.record_tabular("max_mean_reward", max_mean_reward) + + if not os.path.exists(os.path.join(PROJ_DIR, 'models/a2c/')): + try: + os.mkdir(os.path.join(PROJ_DIR, 'models/')) + except Exception as e: + print(str(e)) + try: + os.mkdir(os.path.join(PROJ_DIR, 'models/a2c/')) + except Exception as e: + print(str(e)) + + if last_filename != "": + os.remove(last_filename) + print("delete last model file : %s" % last_filename) + + model = locals['model'] + + filename = os.path.join(PROJ_DIR, 'models/a2c/mineral_%s.pkl' % locals['mean_100ep_reward_a2c']) + model.save(filename) + print("save best mean_100ep_reward model to %s" % filename) + last_filename = filename + + logger.dump_tabular() + +if __name__ == '__main__': + main() From 3ef6fa33931c67331a75160f7f4828a415bb0842 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Thu, 25 Feb 2021 20:58:07 -0500 Subject: [PATCH 05/11] Making things more readable --- a2c/a2c.py | 419 +++++++----------------------------- train_mineral_shards_a2c.py | 13 +- 2 files changed, 85 insertions(+), 347 deletions(-) diff --git a/a2c/a2c.py b/a2c/a2c.py index 68afcc4..76954a4 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -5,18 +5,14 @@ import tensorflow as tf from baselines import logger -from baselines.common import set_global_seeds, explained_variance +from baselines.common import set_global_seeds from baselines.a2c.utils import discount_with_dones from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.utils import cat_entropy -# from a2c import kfac -from pysc2.env import environment from pysc2.lib import actions as sc2_actions -from common import common - import nsml _CONTROL_GROUP_RECALL = 0 @@ -30,70 +26,34 @@ def mse(pred, target): class Model(object): - def __init__(self, - policy, - ob_space, - ac_space, - nenvs, - total_timesteps, - nprocs=32, - nscripts=16, - nsteps=20, - nstack=4, - ent_coef=0.1, - vf_coef=0.5, - vf_fisher_coef=1.0, - lr=0.25, - max_grad_norm=0.001, - kfac_clip=0.001, - lrschedule='linear', - alpha=0.99, - epsilon=1e-5): - config = tf.ConfigProto( - allow_soft_placement=True, - intra_op_parallelism_threads=nprocs, - inter_op_parallelism_threads=nprocs) + def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, lr=0.25, max_grad_norm=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) - # nact = ac_space.n nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - - XY0 = tf.placeholder(tf.int32, [nbatch]) - XY1 = tf.placeholder(tf.int32, 
[nbatch]) + a = tf.placeholder(tf.int32, [nbatch]) - # ADV == TD_TARGET - values - ADV = tf.placeholder(tf.float32, [nbatch]) - TD_TARGET = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) + xy0 = tf.placeholder(tf.int32, [nbatch]) + xy1 = tf.placeholder(tf.int32, [nbatch]) - self.model = step_model = policy( - sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy( - sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + adv = tf.placeholder(tf.float32, [nbatch]) + td_target = tf.placeholder(tf.float32, [nbatch]) + pg_lr = tf.placeholder(tf.float32, []) - # Policy 1 : Base Action : train_model.pi label = A + self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - script_mask = tf.concat( - [ - tf.zeros([nscripts * nsteps, 1]), - tf.ones([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) + script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi, labels=A) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(a, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a) neglogpac *= tf.stop_gradient(pac_weight) - inv_A = 1.0 - tf.cast(A, tf.float32) - - xy0_mask = tf.cast(A, tf.float32) - xy1_mask = tf.cast(A, tf.float32) + xy0_mask = tf.cast(a, tf.float32) + xy1_mask = tf.cast(a, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) @@ -102,46 +62,32 @@ def __init__(self, condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - # One hot representation of chosen marine. - # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy0, labels=XY0) + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=xy0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum( - pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - # 1D? 2D? 
- logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=pi_xy1, labels=XY1) + logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=xy1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) - pg_loss = tf.reduce_mean(ADV * neglogpac) - pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + pg_loss = tf.reduce_mean(adv * neglogpac) + pg_loss_xy0 = tf.reduce_mean(adv * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(adv * logpac_xy1) vf_ = tf.squeeze(train_model.vf) - vf_r = tf.concat( - [ - tf.ones([nscripts * nsteps, 1]), - tf.zeros([(nprocs - nscripts) * nsteps, 1]) - ], - axis=0) * TD_TARGET + vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * td_target vf_masked = vf_ * script_mask + vf_r - # vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] - - vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + vf_loss = tf.reduce_mean(mse(vf_masked, td_target)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) @@ -154,48 +100,33 @@ def __init__(self, if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) - self.logits = logits = train_model.pi - - # xy0 + self.logits = train_model.pi - self.params_common = params_common = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy0') + params_common + self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - self.grads_check_xy0 = grads_xy0 = tf.gradients( - train_loss_xy0, params_xy0) + self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - # xy1 - - self.params_xy1 = params_xy1 = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, - scope='model/xy1') + params_common - + self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - self.grads_check_xy1 = grads_xy1 = tf.gradients( - train_loss_xy1, params_xy1) + self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer( - learning_rate=lr, decay=alpha, epsilon=epsilon) + trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, 
epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) @@ -207,27 +138,19 @@ def train(obs, states, td_targets, masks, actions, xy0, xy1, values): td_map = { train_model.X: obs, - A: actions, - XY0: xy0, - XY1: xy1, - ADV: advs, - TD_TARGET: td_targets, - PG_LR: cur_lr + a: actions, + xy0: xy0, + xy1: xy1, + adv: advs, + td_target: td_targets, + pg_lr: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks - policy_loss, value_loss, policy_entropy, _, \ - policy_loss_xy0, policy_entropy_xy0, _, \ - policy_loss_xy1, policy_entropy_xy1, _ = sess.run( - [pg_loss, vf_loss, entropy, _train, - pg_loss_xy0, entropy_xy0, _train_xy0, - pg_loss_xy1, entropy_xy1, _train_xy1], - td_map) - return policy_loss, value_loss, policy_entropy, \ - policy_loss_xy0, policy_entropy_xy0, \ - policy_loss_xy1, policy_entropy_xy1 + policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) + return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) @@ -254,14 +177,7 @@ def load(load_path): class Runner(object): - def __init__(self, - env, - model, - nsteps, - nscripts, - nstack, - gamma, - callback=None): + def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): self.env = env self.model = model nh, nw, nc = (32, 32, 3) @@ -280,7 +196,7 @@ def __init__(self, self.army_counts = army_counts self.control_groups = control_groups self.selected = selected - self.update_obs(obs) # (2,13,32,32) + self.update_obs(obs) self.update_available(available_actions) self.gamma = gamma self.states = model.initial_state @@ -300,13 +216,12 @@ def __init__(self, self.group_id = [0 for _ in range(nenv)] - def update_obs(self, obs): # (self.nenv, 32, 32, 2) + def update_obs(self, obs): obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) self.obs = np.roll(self.obs, shift=-3, axis=3) new_map = np.zeros((self.nenv, 32, 32, 3)) new_map[:, :, :, -1] = obs[:, 0, :, :] for env_num in range(self.nenv): - # print("xy_per_marine: ", self.xy_per_marine) if "0" not in self.xy_per_marine[env_num]: self.xy_per_marine[env_num]["0"] = [0, 0] if "1" not in self.xy_per_marine[env_num]: @@ -317,39 +232,29 @@ def update_obs(self, obs): # (self.nenv, 32, 32, 2) new_map[env_num, marine0[0], marine0[1], -3] = 1 new_map[env_num, marine1[0], marine1[1], -2] = 1 self.obs[:, :, :, -3:] = new_map - # could not broadcast input array from shape (4,1,32,32) into shape (4,4,32) def update_available(self, _available_actions): - # print("update_available : ", _available_actions) self.available_actions = _available_actions - # avail = np.array([[0,1,2,3,4,7], [0,1,2,3,4,7]]) self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) for env_num, list in enumerate(_available_actions): - # print("env_num :", env_num, " list :", list) for action_num in list: - # print("action_num :", action_num) - if (action_num == 4): + if action_num == 4: self.base_act_mask[env_num][0] = 1 self.base_act_mask[env_num][1] = 1 elif action_num == 0: self.base_act_mask[env_num][2] = 1 - # elif(action_num == 331): - # self.base_act_mask[env_num][2] = 1 def valid_base_action(self, base_actions): for env_num, list in 
enumerate(self.available_actions): avail = [] for action_num in list: - if (action_num == 4): + if action_num == 4: avail.append(0) avail.append(1) elif action_num == 0: avail.append(2) - # elif(action_num == 331): - # avail.append(2) if base_actions[env_num] not in avail: - # print("env_num", env_num, " argmax is not valid. random pick ", avail) base_actions[env_num] = np.random.choice(avail) return base_actions @@ -357,99 +262,44 @@ def valid_base_action(self, base_actions): def trans_base_actions(self, base_actions): new_base_actions = np.copy(base_actions) for env_num, ba in enumerate(new_base_actions): - if (ba == 0): - new_base_actions[env_num] = 4 # move marine control group 0 - elif (ba == 1): - new_base_actions[env_num] = 4 # move marine control group 1 - elif (ba == 2): - new_base_actions[env_num] = 0 # move marine control group 1 - # elif(ba==2): - # new_base_actions[env_num] = 331 # move marine xy0 + if ba == 0: + new_base_actions[env_num] = 4 + elif ba == 1: + new_base_actions[env_num] = 4 + elif ba == 2: + new_base_actions[env_num] = 0 return new_base_actions def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): actions = [] for env_num, spec in enumerate(base_action_spec): - # print("spec", spec.args) - args = [] - # for arg_idx, arg in enumerate(spec.args): - # #print("arg", arg) - # #print("arg.id", arg.id) - # if(arg.id==0): # screen (32,32) x0, y0 - # args.append([int(x0[env_num]), int(y0[env_num])]) - # # elif(arg.id==1): # minimap (32,32) x1, y1 - # # args.append([int(x1[env_num]), int(y1[env_num])]) - # # elif(arg.id==2): # screen2 (32,32) x2, y2 - # # args.append([int(x2[env_num]), y2[env_num]]) - # elif(arg.id==3): # pi3 queued (2) - # args.append([int(0)]) - # elif(arg.id==4): # pi4 control_group_act (5) - # args.append([_CONTROL_GROUP_RECALL]) - # elif(arg.id==5): # pi5 control_group_id 10 - # args.append([int(base_actions[env_num])]) # 0 => cg 0 / 1 => cg 1 - # # elif(arg.id==6): # pi6 select_point_act 4 - # # args.append([int(sub6[env_num])]) - # # elif(arg.id==7): # pi7 select_add 2 - # # args.append([int(sub7[env_num])]) - # # elif(arg.id==8): # pi8 select_unit_act 4 - # # args.append([int(sub8[env_num])]) - # # elif(arg.id==9): # pi9 select_unit_id 500 - # # args.append([int(sub9[env_num])]) - # # elif(arg.id==10): # pi10 select_worker 4 - # # args.append([int(sub10[env_num])]) - # # elif(arg.id==11): # pi11 build_queue_id 10 - # # args.append([int(sub11[env_num])]) - # # elif(arg.id==12): # pi12 unload_id 500 - # # args.append([int(sub12[env_num])]) - # else: - # raise NotImplementedError("cannot construct this arg", spec.args) two_action = [] if base_actions[env_num] == 0: - two_action.append( - sc2_actions.FunctionCall( - 4, - [[_CONTROL_GROUP_RECALL], [0]] - )) - - two_action.append( - sc2_actions.FunctionCall( - 331, - [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) elif base_actions[env_num] == 1: - two_action.append( - sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append( - sc2_actions.FunctionCall( - 331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) elif base_actions[env_num] == 2: - two_action.append( - sc2_actions.FunctionCall(0, [])) 
- two_action.append( - sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) - # action = sc2_actions.FunctionCall(a, args) actions.append(two_action) return actions def run(self): - mb_obs, mb_td_targets, mb_base_actions, \ - mb_xy0, mb_xy1, \ - mb_values, mb_dones \ - = [], [], [], [], [], [], [] + mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] mb_states = self.states for n in range(self.nsteps): - # pi, pi2, x1, y1, x2, y2, v0 - pi1, pi_xy0, pi_xy1, values, states = self.model.step( - self.obs, self.states, self.dones) + pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - base_actions = np.argmax( - pi1 * self.base_act_mask + pi1_noise, axis=1) + base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) xy0 = np.argmax(pi_xy0, axis=1) x0 = (xy0 % 32).astype(int) @@ -459,59 +309,11 @@ def run(self): x1 = (xy1 % 32).astype(int) y1 = (xy1 / 32).astype(int) - # Scripted Agent Hacking - - for env_num in range(self.nenv): - if env_num >= self.nscripts: # only for scripted agents - continue - - ob = self.obs[env_num, :, :, :] - player_relative = ob[:, :, -1] - - self.group_list[env_num] = common.update_group_list2( - self.control_groups[env_num]) - - if len(self.action_queue[env_num]) == 0: - self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], \ - self.xy_per_marine[env_num] = \ - common.solve_tsp(player_relative, - self.selected[env_num][0], - self.group_list[env_num], - self.group_id[env_num], - self.dest_per_marine[env_num], - self.xy_per_marine[env_num]) - - base_actions[env_num] = 0 - x0[env_num] = 0 - y0[env_num] = 0 - x1[env_num] = 0 - y1[env_num] = 0 - - if len(self.action_queue[env_num]) > 0: - action = self.action_queue[env_num].pop(0) - base_actions[env_num] = action.get("base_action", 0) - - x0[env_num] = action.get("x0", 0) - y0[env_num] = action.get("y0", 0) - xy0[env_num] = y0[env_num] * 32 + x0[env_num] - - x1[env_num] = action.get("x1", 0) - y1[env_num] = action.get("y1", 0) - xy1[env_num] = y1[env_num] * 32 + x1[env_num] - base_actions = self.valid_base_action(base_actions) new_base_actions = self.trans_base_actions(base_actions) base_action_spec = self.env.action_spec(new_base_actions) - # print("base_actions:", base_actions) - actions = self.construct_action( - base_actions, - base_action_spec, - x0, - y0, - x1, - y1 - ) + actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) mb_obs.append(np.copy(self.obs)) mb_base_actions.append(base_actions) @@ -521,12 +323,7 @@ def run(self): mb_values.append(values) mb_dones.append(self.dones) - # print("final acitons : ", actions) - obs, rewards, dones, \ - available_actions, army_counts, \ - control_groups, selected, xy_per_marine \ - = self.env.step( - actions=actions) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) self.army_counts = army_counts self.control_groups = control_groups self.selected = selected @@ -546,36 +343,11 @@ def run(self): self.episode_rewards.append(self.total_reward[n]) model = self.model - mean_100ep_reward = round( - np.mean(self.episode_rewards[-101:]), 1) - if (n < self.nscripts): # scripted agents - self.episode_rewards_script.append( - self.total_reward[n]) - mean_100ep_reward_script = round( - 
np.mean(self.episode_rewards_script[-101:]), 1) - nsml.report( - reward_script=self.total_reward[n], - mean_reward_script=mean_100ep_reward_script, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - else: - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round( - np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report( - reward_a2c=self.total_reward[n], - mean_reward_a2c=mean_100ep_reward_a2c, - reward=self.total_reward[n], - mean_100ep_reward=mean_100ep_reward, - episodes=self.episodes, - step=self.episodes, - scope=locals() - ) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + mean_100ep_reward = round(np.mean(self.episode_rewards[-101:]), 1) + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round(np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report(reward_a2c=self.total_reward[n], mean_reward_a2c=mean_100ep_reward_a2c, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, episodes=self.episodes, step=self.episodes, scope=locals()) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) if self.callback is not None: self.callback(locals(), globals()) @@ -585,13 +357,9 @@ def run(self): self.update_obs(obs) mb_td_targets.append(rewards) mb_dones.append(self.dones) - # batch of steps to batch of rollouts - mb_obs = np.asarray( - mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( - self.batch_ob_shape) + mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray( - mb_base_actions, dtype=np.int32).swapaxes(1, 0) + mb_base_actions = np.asarray(mb_base_actions, dtype=np.int32).swapaxes(1, 0) mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) @@ -600,16 +368,13 @@ def run(self): mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, - self.dones).tolist() - # discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate( - zip(mb_td_targets, mb_dones, last_values)): + last_values = self.model.value(self.obs, self.states, self.dones).tolist() + + for n, (rewards, dones, value) in enumerate(zip(mb_td_targets, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], - self.gamma)[:-1] + rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_td_targets[n] = rewards @@ -620,8 +385,7 @@ def run(self): mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, \ - mb_base_actions, mb_xy0, mb_xy1, mb_values + return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values def learn(policy, @@ -629,17 +393,14 @@ def learn(policy, seed, total_timesteps=int(40e6), gamma=0.99, - log_interval=1, nprocs=24, nscripts=12, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, - vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.01, - kfac_clip=0.001, save_interval=None, lrschedule='linear', callback=None): @@ -649,19 +410,7 @@ def learn(policy, nenvs = nprocs ob_space = (32, 32, 3) # env.observation_space ac_space = (32, 32) - make_model = 
lambda: Model(policy, ob_space, ac_space, nenvs, - total_timesteps, - nprocs=nprocs, - nscripts=nscripts, - nsteps=nsteps, - nstack=nstack, - ent_coef=ent_coef, - vf_coef=vf_coef, - vf_fisher_coef=vf_fisher_coef, - lr=lr, - max_grad_norm=max_grad_norm, - kfac_clip=kfac_clip, - lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nscripts=nscripts, nsteps=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, lr=lr, max_grad_norm=max_grad_norm, lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle @@ -669,26 +418,14 @@ def learn(policy, fh.write(cloudpickle.dumps(make_model)) model = make_model() print("make_model complete!") - runner = Runner( - env, - model, - nsteps=nsteps, - nscripts=nscripts, - nstack=nstack, - gamma=gamma, - callback=callback) + runner = Runner(env, model, nsteps=nsteps, nscripts=nscripts, nstack=nstack, gamma=gamma, callback=callback) nbatch = nenvs * nsteps - tstart = time.time() - # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs - nseconds = time.time() - tstart - fps = int((update * nbatch) / nseconds) - if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index ba3e90f..aa0f6a9 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -5,16 +5,12 @@ from absl import flags -from pysc2.env import sc2_env from pysc2.lib import actions -from baselines_legacy import cnn_to_mlp, BatchInput -from baselines.logger import Logger, TensorBoardOutputFormat, HumanOutputFormat +from baselines.logger import Logger, TensorBoardOutputFormat from common.vec_env.subproc_vec_env import SubprocVecEnv from a2c.policies import CnnPolicy from a2c import a2c -import deepq_mineral_4way -import deepq_mineral_shards _MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id _SELECT_ARMY = actions.FUNCTIONS.select_army.id @@ -79,11 +75,16 @@ def main(): env, seed, total_timesteps=num_timesteps, + gamma=0.99, nprocs=FLAGS.num_agents + FLAGS.num_scripts, nscripts=FLAGS.num_scripts, - ent_coef=0.5, nsteps=FLAGS.nsteps, + ent_coef=0.5, + vf_coef=0.5, + lr=0.25, max_grad_norm=0.01, + save_interval=1000, + lrschedule='linear', callback=a2c_callback) from baselines import logger From b67106fb21aaa08843e2d8ccb65cf0281a16d1dc Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 16:29:12 -0500 Subject: [PATCH 06/11] Adding a couple changes before I delete and reclone, because having issues with Conda env. 
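The split below leaves a2c.learn as a thin driver around two new modules: a2c/model.py (TensorFlow graph construction plus the train/save/load closures) and a2c/runner.py (environment rollouts). As a rough guide to the shape this refactor produces, here is a minimal Python sketch of that driver loop, with stub Model and Runner classes and a simplified learn() signature standing in for the real pysc2-backed code; the stubs, their toy return values, and the trimmed argument list are illustrative assumptions, not part of this patch.

# Illustrative sketch only: stubbed Model/Runner showing the learn() driver
# structure after the refactor; the real classes live in a2c/model.py and
# a2c/runner.py and wrap TensorFlow and the SC2 environments.

class Model:
    def train(self, obs, states, td_targets, masks, actions, xy0, xy1, values):
        # The real Model runs one RMSProp update per policy head and returns
        # the per-head policy losses, value loss, and entropies.
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

class Runner:
    def __init__(self, env, model, nsteps):
        self.env, self.model, self.nsteps = env, model, nsteps

    def run(self):
        # The real Runner steps the vectorized SC2 envs for nsteps and returns
        # batched observations, td targets, masks, actions, coords, and values.
        return [], [], [], [], [], [], [], []

def learn(env, make_model, total_timesteps, nenvs, nsteps, save_interval=None):
    model = make_model()
    runner = Runner(env, model, nsteps)
    nbatch = nenvs * nsteps
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run()
        model.train(obs, states, td_targets, masks, actions, xy0, xy1, values)
        if save_interval and update % save_interval == 0:
            pass  # the real code checkpoints here via model.save(...)

if __name__ == "__main__":
    # Toy invocation so the sketch runs end to end without SC2 installed.
    learn(env=None, make_model=Model, total_timesteps=80, nenvs=4, nsteps=20)

The point of the sketch is the control flow, not the numbers: rollout collection and gradient updates live behind runner.run() and model.train(), so learn() itself stays a short loop, which is what the a2c/a2c.py hunk below reduces it to.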
--- a2c/__init__.py | 0 a2c/a2c.py | 414 +++--------------------------------- a2c/model.py | 178 ++++++++++++++++ a2c/runner.py | 253 ++++++++++++++++++++++ train_mineral_shards_a2c.py | 9 + 5 files changed, 472 insertions(+), 382 deletions(-) create mode 100644 a2c/__init__.py create mode 100644 a2c/model.py create mode 100644 a2c/runner.py diff --git a/a2c/__init__.py b/a2c/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/a2c/a2c.py b/a2c/a2c.py index 76954a4..61f1e6c 100644 --- a/a2c/a2c.py +++ b/a2c/a2c.py @@ -1,406 +1,32 @@ import os.path as osp import time -import joblib import numpy as np import tensorflow as tf from baselines import logger from baselines.common import set_global_seeds -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy - -from pysc2.lib import actions as sc2_actions - -import nsml - -_CONTROL_GROUP_RECALL = 0 -_NOT_QUEUED = 0 +from a2c.model import Model +from a2c.runner import Runner np.set_printoptions(threshold=np.inf) - -def mse(pred, target): - return tf.square(pred - target) / 2. - - -class Model(object): - def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, lr=0.25, max_grad_norm=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): - config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) - config.gpu_options.allow_growth = True - self.sess = sess = tf.Session(config=config) - nsml.bind(sess=sess) - nbatch = nenvs * nsteps - a = tf.placeholder(tf.int32, [nbatch]) - - xy0 = tf.placeholder(tf.int32, [nbatch]) - xy1 = tf.placeholder(tf.int32, [nbatch]) - - adv = tf.placeholder(tf.float32, [nbatch]) - td_target = tf.placeholder(tf.float32, [nbatch]) - pg_lr = tf.placeholder(tf.float32, []) - - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) - - script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) - - pi = train_model.pi - pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(a, depth=3), axis=1) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a) - neglogpac *= tf.stop_gradient(pac_weight) - - xy0_mask = tf.cast(a, tf.float32) - xy1_mask = tf.cast(a, tf.float32) - - condition0 = tf.equal(xy0_mask, 2) - xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) - xy0_mask = 1.0 - xy0_mask - - condition1 = tf.equal(xy1_mask, 2) - xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) - - pi_xy0 = train_model.pi_xy0 - pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - - logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=xy0) - logpac_xy0 *= tf.stop_gradient(pac_weight) - logpac_xy0 *= tf.cast(xy0_mask, tf.float32) - - pi_xy1 = train_model.pi_xy1 - pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 - pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(xy0, depth=1024), axis=1) - - logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=xy1) - logpac_xy1 *= tf.stop_gradient(pac_weight) - logpac_xy1 *= 
tf.cast(xy1_mask, tf.float32) - - pg_loss = tf.reduce_mean(adv * neglogpac) - pg_loss_xy0 = tf.reduce_mean(adv * logpac_xy0) - pg_loss_xy1 = tf.reduce_mean(adv * logpac_xy1) - - vf_ = tf.squeeze(train_model.vf) - - vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * td_target - vf_masked = vf_ * script_mask + vf_r - - vf_loss = tf.reduce_mean(mse(vf_masked, td_target)) - entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) - entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) - entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) - entropy = entropy_a + entropy_xy0 + entropy_xy1 - - loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef - - params = find_trainable_variables("model") - grads = tf.gradients(loss, params) - if max_grad_norm is not None: - grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) - grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train = trainer.apply_gradients(grads) - - self.logits = train_model.pi - - self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') - self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common - - train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) - if max_grad_norm is not None: - grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) - - grads_xy0 = list(zip(grads_xy0, params_xy0)) - trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) - - self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common - train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss - - self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) - if max_grad_norm is not None: - grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) - - grads_xy1 = list(zip(grads_xy1, params_xy1)) - trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) - _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) - - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) - - def train(obs, states, td_targets, masks, actions, xy0, xy1, values): - advs = td_targets - values - for step in range(len(obs)): - cur_lr = self.lr.value() - - td_map = { - train_model.X: obs, - a: actions, - xy0: xy0, - xy1: xy1, - adv: advs, - td_target: td_targets, - pg_lr: cur_lr - } - if states != []: - td_map[train_model.S] = states - td_map[train_model.M] = masks - - policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) - return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 - - def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) - - def load(load_path): - loaded_params = joblib.load(load_path) - restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) - sess.run(restores) - - self.train = train - self.save = save - self.load = load - self.train_model = train_model - self.step_model 
= step_model - self.step = step_model.step - self.value = step_model.value - self.initial_state = step_model.initial_state - print("global_variables_initializer start") - tf.global_variables_initializer().run(session=sess) - print("global_variables_initializer complete") - - -class Runner(object): - def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): - self.env = env - self.model = model - nh, nw, nc = (32, 32, 3) - self.nsteps = nsteps - self.nscripts = nscripts - self.nenv = nenv = env.num_envs - self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) - self.batch_coord_shape = (nenv * nsteps, 32) - self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) - self.available_actions = None - self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() - self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - self.update_obs(obs) - self.update_available(available_actions) - self.gamma = gamma - self.states = model.initial_state - self.dones = [False for _ in range(nenv)] - self.total_reward = [0.0 for _ in range(nenv)] - self.episode_rewards = [] - self.episode_rewards_script = [] - self.episode_rewards_a2c = [] - self.episodes = 0 - self.steps = 0 - self.callback = callback - - self.action_queue = [[] for _ in range(nenv)] - self.group_list = [[] for _ in range(nenv)] - self.agent_state = ["IDLE" for _ in range(nenv)] - self.dest_per_marine = [{} for _ in range(nenv)] - - self.group_id = [0 for _ in range(nenv)] - - def update_obs(self, obs): - obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) - self.obs = np.roll(self.obs, shift=-3, axis=3) - new_map = np.zeros((self.nenv, 32, 32, 3)) - new_map[:, :, :, -1] = obs[:, 0, :, :] - for env_num in range(self.nenv): - if "0" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["0"] = [0, 0] - if "1" not in self.xy_per_marine[env_num]: - self.xy_per_marine[env_num]["1"] = [0, 0] - - marine0 = self.xy_per_marine[env_num]["0"] - marine1 = self.xy_per_marine[env_num]["1"] - new_map[env_num, marine0[0], marine0[1], -3] = 1 - new_map[env_num, marine1[0], marine1[1], -2] = 1 - self.obs[:, :, :, -3:] = new_map - - def update_available(self, _available_actions): - self.available_actions = _available_actions - self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) - for env_num, list in enumerate(_available_actions): - for action_num in list: - if action_num == 4: - self.base_act_mask[env_num][0] = 1 - self.base_act_mask[env_num][1] = 1 - elif action_num == 0: - self.base_act_mask[env_num][2] = 1 - - def valid_base_action(self, base_actions): - for env_num, list in enumerate(self.available_actions): - avail = [] - for action_num in list: - if action_num == 4: - avail.append(0) - avail.append(1) - elif action_num == 0: - avail.append(2) - - if base_actions[env_num] not in avail: - base_actions[env_num] = np.random.choice(avail) - - return base_actions - - def trans_base_actions(self, base_actions): - new_base_actions = np.copy(base_actions) - for env_num, ba in enumerate(new_base_actions): - if ba == 0: - new_base_actions[env_num] = 4 - elif ba == 1: - new_base_actions[env_num] = 4 - elif ba == 2: - new_base_actions[env_num] = 0 - - return new_base_actions - - def 
construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): - actions = [] - for env_num, spec in enumerate(base_action_spec): - two_action = [] - if base_actions[env_num] == 0: - two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) - two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) - - elif base_actions[env_num] == 1: - two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) - two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) - elif base_actions[env_num] == 2: - two_action.append(sc2_actions.FunctionCall(0, [])) - two_action.append(sc2_actions.FunctionCall(0, [])) - - actions.append(two_action) - - return actions - - def run(self): - mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] - - mb_states = self.states - for n in range(self.nsteps): - pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) - - pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 - - base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) - xy0 = np.argmax(pi_xy0, axis=1) - - x0 = (xy0 % 32).astype(int) - y0 = (xy0 / 32).astype(int) - - xy1 = np.argmax(pi_xy1, axis=1) - x1 = (xy1 % 32).astype(int) - y1 = (xy1 / 32).astype(int) - - base_actions = self.valid_base_action(base_actions) - new_base_actions = self.trans_base_actions(base_actions) - - base_action_spec = self.env.action_spec(new_base_actions) - actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) - - mb_obs.append(np.copy(self.obs)) - mb_base_actions.append(base_actions) - - mb_xy0.append(xy0) - mb_xy1.append(xy1) - mb_values.append(values) - mb_dones.append(self.dones) - - obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) - self.army_counts = army_counts - self.control_groups = control_groups - self.selected = selected - for env_num, data in enumerate(xy_per_marine): - self.xy_per_marine[env_num] = data - self.update_available(available_actions) - - self.states = states - self.dones = dones - mean_100ep_reward_a2c = 0 - for n, done in enumerate(dones): - self.total_reward[n] += float(rewards[n]) - if done: - self.obs[n] = self.obs[n] * 0 - self.episodes += 1 - num_episodes = self.episodes - self.episode_rewards.append(self.total_reward[n]) - - model = self.model - mean_100ep_reward = round(np.mean(self.episode_rewards[-101:]), 1) - self.episode_rewards_a2c.append(self.total_reward[n]) - mean_100ep_reward_a2c = round(np.mean(self.episode_rewards_a2c[-101:]), 1) - nsml.report(reward_a2c=self.total_reward[n], mean_reward_a2c=mean_100ep_reward_a2c, reward=self.total_reward[n], mean_100ep_reward=mean_100ep_reward, episodes=self.episodes, step=self.episodes, scope=locals()) - print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) - - if self.callback is not None: - self.callback(locals(), globals()) - self.total_reward[n] = 0 - self.group_list[n] = [] - - self.update_obs(obs) - mb_td_targets.append(rewards) - mb_dones.append(self.dones) - mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) - mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) - mb_base_actions = np.asarray(mb_base_actions, dtype=np.int32).swapaxes(1, 0) - - mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) - mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) 
- - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - last_values = self.model.value(self.obs, self.states, self.dones).tolist() - - for n, (rewards, dones, value) in enumerate(zip(mb_td_targets, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_td_targets[n] = rewards - mb_td_targets = mb_td_targets.flatten() - mb_base_actions = mb_base_actions.flatten() - mb_xy0 = mb_xy0.flatten() - mb_xy1 = mb_xy1.flatten() - - mb_values = mb_values.flatten() - mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values - - def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, + log_interval=1, nprocs=24, nscripts=12, nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, + vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.01, + kfac_clip=0.001, save_interval=None, lrschedule='linear', callback=None): @@ -410,7 +36,19 @@ def learn(policy, nenvs = nprocs ob_space = (32, 32, 3) # env.observation_space ac_space = (32, 32) - make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nscripts=nscripts, nsteps=nsteps, nstack=nstack, ent_coef=ent_coef, vf_coef=vf_coef, lr=lr, max_grad_norm=max_grad_norm, lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, nenvs, + total_timesteps, + nprocs=nprocs, + nscripts=nscripts, + nsteps=nsteps, + nstack=nstack, + ent_coef=ent_coef, + vf_coef=vf_coef, + vf_fisher_coef=vf_fisher_coef, + lr=lr, + max_grad_norm=max_grad_norm, + kfac_clip=kfac_clip, + lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle @@ -418,14 +56,26 @@ def learn(policy, fh.write(cloudpickle.dumps(make_model)) model = make_model() print("make_model complete!") - runner = Runner(env, model, nsteps=nsteps, nscripts=nscripts, nstack=nstack, gamma=gamma, callback=callback) + runner = Runner( + env, + model, + nsteps=nsteps, + nscripts=nscripts, + nstack=nstack, + gamma=gamma, + callback=callback) nbatch = nenvs * nsteps + tstart = time.time() + # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True) for update in range(1, total_timesteps // nbatch + 1): obs, states, td_targets, masks, actions, xy0, xy1, values = runner.run() - policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) + model.policy_loss, model.value_loss, model.policy_entropy, model.policy_loss_xy0, model.policy_entropy_xy0, model.policy_loss_xy1, model.policy_entropy_xy1, = model.train(obs, states, td_targets, masks, actions, xy0, xy1, values) model.old_obs = obs + nseconds = time.time() - tstart + fps = int((update * nbatch) / nseconds) + if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) diff --git a/a2c/model.py b/a2c/model.py new file mode 100644 index 0000000..76aeb9d --- /dev/null +++ b/a2c/model.py @@ -0,0 +1,178 @@ +import joblib +import tensorflow as tf +from baselines.a2c.utils import cat_entropy, find_trainable_variables, Scheduler + +import nsml + + +class 
Model(object): + def __init__(self, + policy, + ob_space, + ac_space, + nenvs, + total_timesteps, + nprocs=32, + nscripts=16, + nsteps=20, + nstack=4, + ent_coef=0.1, + vf_coef=0.5, + vf_fisher_coef=1.0, + lr=0.25, + max_grad_norm=0.001, + kfac_clip=0.001, + lrschedule='linear', + alpha=0.99, + epsilon=1e-5): + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) + config.gpu_options.allow_growth = True + self.sess = sess = tf.Session(config=config) + nsml.bind(sess=sess) + nbatch = nenvs * nsteps + A = tf.placeholder(tf.int32, [nbatch]) + + XY0 = tf.placeholder(tf.int32, [nbatch]) + XY1 = tf.placeholder(tf.int32, [nbatch]) + + ADV = tf.placeholder(tf.float32, [nbatch]) + TD_TARGET = tf.placeholder(tf.float32, [nbatch]) + PG_LR = tf.placeholder(tf.float32, []) + + self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) + + script_mask = tf.concat([tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1])], axis=0) + + pi = train_model.pi + pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A) + neglogpac *= tf.stop_gradient(pac_weight) + + xy0_mask = tf.cast(A, tf.float32) + xy1_mask = tf.cast(A, tf.float32) + + condition0 = tf.equal(xy0_mask, 2) + xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) + xy0_mask = 1.0 - xy0_mask + + condition1 = tf.equal(xy1_mask, 2) + xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) + + pi_xy0 = train_model.pi_xy0 + pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=XY0) + logpac_xy0 *= tf.stop_gradient(pac_weight) + logpac_xy0 *= tf.cast(xy0_mask, tf.float32) + + pi_xy1 = train_model.pi_xy1 + pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 + pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) + + logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=XY1) + logpac_xy1 *= tf.stop_gradient(pac_weight) + logpac_xy1 *= tf.cast(xy1_mask, tf.float32) + + pg_loss = tf.reduce_mean(ADV * neglogpac) + pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) + pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) + + vf_ = tf.squeeze(train_model.vf) + + vf_r = tf.concat([tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1])], axis=0) * TD_TARGET + vf_masked = vf_ * script_mask + vf_r + + vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) + entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) + entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) + entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) + entropy = entropy_a + entropy_xy0 + entropy_xy1 + + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + + params = find_trainable_variables("model") + grads = tf.gradients(loss, params) + if max_grad_norm is not None: + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) + grads = list(zip(grads, params)) + trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train = trainer.apply_gradients(grads) + + self.logits = train_model.pi + + 
self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') + self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common + + train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0) + if max_grad_norm is not None: + grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) + + grads_xy0 = list(zip(grads_xy0, params_xy0)) + trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) + + self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common + train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss + + self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1) + if max_grad_norm is not None: + grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) + + grads_xy1 = list(zip(grads_xy1, params_xy1)) + trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) + _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) + + self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + + def train(obs, states, td_targets, masks, actions, xy0, xy1, values): + advs = td_targets - values + for step in range(len(obs)): + cur_lr = self.lr.value() + + td_map = { + train_model.X: obs, + A: actions, + XY0: xy0, + XY1: xy1, + ADV: advs, + TD_TARGET: td_targets, + PG_LR: cur_lr + } + if states != []: + td_map[train_model.S] = states + td_map[train_model.M] = masks + + policy_loss, value_loss, policy_entropy, _, policy_loss_xy0, policy_entropy_xy0, _, policy_loss_xy1, policy_entropy_xy1, _ = sess.run([pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) + return policy_loss, value_loss, policy_entropy, policy_loss_xy0, policy_entropy_xy0, policy_loss_xy1, policy_entropy_xy1 + + def save(save_path): + ps = sess.run(params) + joblib.dump(ps, save_path) + + def load(load_path): + loaded_params = joblib.load(load_path) + restores = [] + for p, loaded_p in zip(params, loaded_params): + restores.append(p.assign(loaded_p)) + sess.run(restores) + + self.train = train + self.save = save + self.load = load + self.train_model = train_model + self.step_model = step_model + self.step = step_model.step + self.value = step_model.value + self.initial_state = step_model.initial_state + print("global_variables_initializer start") + tf.global_variables_initializer().run(session=sess) + print("global_variables_initializer complete") + + +def mse(pred, target): + return tf.square(pred - target) / 2. 
\ No newline at end of file diff --git a/a2c/runner.py b/a2c/runner.py new file mode 100644 index 0000000..723f781 --- /dev/null +++ b/a2c/runner.py @@ -0,0 +1,253 @@ +import numpy as np +from baselines.a2c.utils import discount_with_dones +from pysc2.lib import actions as sc2_actions + +import nsml + + +class Runner(object): + def __init__(self, env, model, nsteps, nscripts, nstack, gamma, callback=None): + self.env = env + self.model = model + nh, nw, nc = (32, 32, 3) + self.nsteps = nsteps + self.nscripts = nscripts + self.nenv = nenv = env.num_envs + self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack) + self.batch_coord_shape = (nenv * nsteps, 32) + self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) + self.available_actions = None + self.base_act_mask = np.full((self.nenv, 2), 0, dtype=np.uint8) + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = env.reset() + self.xy_per_marine = [{"0": [0, 0], "1": [0, 0]} for _ in range(nenv)] + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + self.update_obs(obs) + self.update_available(available_actions) + self.gamma = gamma + self.states = model.initial_state + self.dones = [False for _ in range(nenv)] + self.total_reward = [0.0 for _ in range(nenv)] + self.episode_rewards = [] + self.episode_rewards_script = [] + self.episode_rewards_a2c = [] + self.episodes = 0 + self.steps = 0 + self.callback = callback + + self.action_queue = [[] for _ in range(nenv)] + self.group_list = [[] for _ in range(nenv)] + self.agent_state = ["IDLE" for _ in range(nenv)] + self.dest_per_marine = [{} for _ in range(nenv)] + + self.group_id = [0 for _ in range(nenv)] + + def update_obs(self, obs): + obs = np.asarray(obs, dtype=np.int32).swapaxes(1, 2).swapaxes(2, 3) + self.obs = np.roll(self.obs, shift=-3, axis=3) + new_map = np.zeros((self.nenv, 32, 32, 3)) + new_map[:, :, :, -1] = obs[:, 0, :, :] + for env_num in range(self.nenv): + if "0" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["0"] = [0, 0] + if "1" not in self.xy_per_marine[env_num]: + self.xy_per_marine[env_num]["1"] = [0, 0] + + marine0 = self.xy_per_marine[env_num]["0"] + marine1 = self.xy_per_marine[env_num]["1"] + new_map[env_num, marine0[0], marine0[1], -3] = 1 + new_map[env_num, marine1[0], marine1[1], -2] = 1 + self.obs[:, :, :, -3:] = new_map + + def update_available(self, _available_actions): + self.available_actions = _available_actions + self.base_act_mask = np.full((self.nenv, 3), 0, dtype=np.uint8) + for env_num, list in enumerate(_available_actions): + for action_num in list: + if action_num == 4: + self.base_act_mask[env_num][0] = 1 + self.base_act_mask[env_num][1] = 1 + elif action_num == 0: + self.base_act_mask[env_num][2] = 1 + + def valid_base_action(self, base_actions): + for env_num, list in enumerate(self.available_actions): + avail = [] + for action_num in list: + if action_num == 4: + avail.append(0) + avail.append(1) + elif action_num == 0: + avail.append(2) + + if base_actions[env_num] not in avail: + base_actions[env_num] = np.random.choice(avail) + + return base_actions + + def trans_base_actions(self, base_actions): + new_base_actions = np.copy(base_actions) + for env_num, ba in enumerate(new_base_actions): + if ba == 0: + new_base_actions[env_num] = 4 # move marine control group 0 + elif ba == 1: + new_base_actions[env_num] = 4 # move marine control group 1 + 
elif ba == 2: + new_base_actions[env_num] = 0 # move marine control group 1 + + return new_base_actions + + def construct_action(self, base_actions, base_action_spec, x0, y0, x1, y1): + actions = [] + for env_num, spec in enumerate(base_action_spec): + two_action = [] + if base_actions[env_num] == 0: + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [0]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x0[env_num]), y0[env_num]]])) + + elif base_actions[env_num] == 1: + two_action.append(sc2_actions.FunctionCall(4, [[_CONTROL_GROUP_RECALL], [1]])) + two_action.append(sc2_actions.FunctionCall(331, [[_NOT_QUEUED], [int(x1[env_num]), y1[env_num]]])) + elif base_actions[env_num] == 2: + two_action.append(sc2_actions.FunctionCall(0, [])) + two_action.append(sc2_actions.FunctionCall(0, [])) + + actions.append(two_action) + + return actions + + def run(self): + mb_obs, mb_td_targets, mb_base_actions, mb_xy0, mb_xy1, mb_values, mb_dones = [], [], [], [], [], [], [] + + mb_states = self.states + for n in range(self.nsteps): + pi1, pi_xy0, pi_xy1, values, states = self.model.step(self.obs, self.states, self.dones) + + pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3 + + base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise, axis=1) + xy0 = np.argmax(pi_xy0, axis=1) + + x0 = (xy0 % 32).astype(int) + y0 = (xy0 / 32).astype(int) + + xy1 = np.argmax(pi_xy1, axis=1) + x1 = (xy1 % 32).astype(int) + y1 = (xy1 / 32).astype(int) + + base_actions = self.valid_base_action(base_actions) + new_base_actions = self.trans_base_actions(base_actions) + + base_action_spec = self.env.action_spec(new_base_actions) + actions = self.construct_action(base_actions, base_action_spec, x0, y0, x1, y1) + + mb_obs.append(np.copy(self.obs)) + mb_base_actions.append(base_actions) + + mb_xy0.append(xy0) + mb_xy1.append(xy1) + mb_values.append(values) + mb_dones.append(self.dones) + + obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(actions=actions) + self.army_counts = army_counts + self.control_groups = control_groups + self.selected = selected + for env_num, data in enumerate(xy_per_marine): + self.xy_per_marine[env_num] = data + self.update_available(available_actions) + + self.states = states + self.dones = dones + mean_100ep_reward_a2c = 0 + for n, done in enumerate(dones): + self.total_reward[n] += float(rewards[n]) + if done: + self.obs[n] = self.obs[n] * 0 + self.episodes += 1 + num_episodes = self.episodes + self.episode_rewards.append(self.total_reward[n]) + + model = self.model + mean_100ep_reward = round( + np.mean(self.episode_rewards[-101:]), 1) + if n < self.nscripts: # scripted agents + self.episode_rewards_script.append( + self.total_reward[n]) + mean_100ep_reward_script = round( + np.mean(self.episode_rewards_script[-101:]), 1) + nsml.report( + reward_script=self.total_reward[n], + mean_reward_script=mean_100ep_reward_script, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + else: + self.episode_rewards_a2c.append(self.total_reward[n]) + mean_100ep_reward_a2c = round( + np.mean(self.episode_rewards_a2c[-101:]), 1) + nsml.report( + reward_a2c=self.total_reward[n], + mean_reward_a2c=mean_100ep_reward_a2c, + reward=self.total_reward[n], + mean_100ep_reward=mean_100ep_reward, + episodes=self.episodes, + step=self.episodes, + scope=locals() + ) + print("mean_100ep_reward_a2c", mean_100ep_reward_a2c) + + if 
self.callback is not None: + self.callback(locals(), globals()) + self.total_reward[n] = 0 + self.group_list[n] = [] + + self.update_obs(obs) + mb_td_targets.append(rewards) + mb_dones.append(self.dones) + # batch of steps to batch of rollouts + mb_obs = np.asarray( + mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape( + self.batch_ob_shape) + mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0) + mb_base_actions = np.asarray( + mb_base_actions, dtype=np.int32).swapaxes(1, 0) + + mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0) + mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0) + + mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) + mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) + mb_masks = mb_dones[:, :-1] + mb_dones = mb_dones[:, 1:] + last_values = self.model.value(self.obs, self.states, + self.dones).tolist() + # discount/bootstrap off value fn + for n, (rewards, dones, value) in enumerate( + zip(mb_td_targets, mb_dones, last_values)): + rewards = rewards.tolist() + dones = dones.tolist() + if dones[-1] == 0: + rewards = discount_with_dones(rewards + [value], dones + [0], + self.gamma)[:-1] + else: + rewards = discount_with_dones(rewards, dones, self.gamma) + mb_td_targets[n] = rewards + mb_td_targets = mb_td_targets.flatten() + mb_base_actions = mb_base_actions.flatten() + mb_xy0 = mb_xy0.flatten() + mb_xy1 = mb_xy1.flatten() + + mb_values = mb_values.flatten() + mb_masks = mb_masks.flatten() + return mb_obs, mb_states, mb_td_targets, mb_masks, mb_base_actions, mb_xy0, mb_xy1, mb_values + + +_CONTROL_GROUP_RECALL = 0 +_NOT_QUEUED = 0 \ No newline at end of file diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index aa0f6a9..6517580 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -95,6 +95,15 @@ def a2c_callback(locals, globals): logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) logger.record_tabular("num_episodes", locals['num_episodes']) logger.record_tabular("environment_number", locals['env_num']) + logger.record_tabular("policy_loss", locals['model'].policy_loss) + logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) + logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) + logger.record_tabular("policy_entropy", locals['model'].policy_entropy) + logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) + logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) + logger.record_tabular("learning_rate_N", locals['model'].lr.n) + logger.record_tabular("learning_rate_V", locals['model'].lr.v) + logger.record_tabular("value_loss", locals['model'].value_loss) logger.record_tabular("done", locals['done']) if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: From f08fa57c1e0824782568e8d1b2c917ed41505ed4 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 16:30:57 -0500 Subject: [PATCH 07/11] Adding a couple changes before I delete and reclone, because having issues with Conda env. 
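A note on the n-step targets built at the end of Runner.run() in a2c/runner.py above: when a rollout does not end on a terminal step, the value head's estimate for the last observation is appended to the rewards, discounted back, and then dropped again. The sketch below is how I read baselines.a2c.utils.discount_with_dones (this standalone copy is an assumption for illustration, not the canonical implementation); the example reproduces the "rewards + [value] ... [:-1]" bootstrapping path. The xy heads decode the same way as in run(): x = xy % 32 and y = xy // 32 on the 32x32 screen.

def discount_with_dones(rewards, dones, gamma):
    # assumed equivalent to baselines.a2c.utils.discount_with_dones
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)  # a done step resets the running return
        discounted.append(running)
    return discounted[::-1]

# Three-step rollout that does not hit a terminal state: append the critic's value
# for the last obs (10.0 here), discount it back, then drop it -- the same
# "rewards + [value] ... [:-1]" path used in Runner.run().
rewards, dones, gamma, last_value = [1.0, 0.0, 2.0], [0, 0, 0], 0.99, 10.0
targets = discount_with_dones(rewards + [last_value], dones + [0], gamma)[:-1]
print(targets)  # [12.66..., 11.78..., 11.9] -- bootstrapped n-step returns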
--- requirements.txt | 11 +++-- train_mineral_shards_a2c.py | 86 ++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/requirements.txt b/requirements.txt index 029d0f6..5a23461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,12 @@ git+https://github.com/deepmind/pysc2 git+https://github.com/openai/baselines -numpy -tensorflow +numpy~=1.19.5 +tensorflow~=1.14.0 absl-py cloudpickle -dill \ No newline at end of file +dill~=0.3.3 +tensorflow-gpu~=1.14.0 +joblib~=1.0.1 +six~=1.15.0 +future~=0.18.2 +s2clientprotocol~=5.0.6.83830.0 \ No newline at end of file diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index 6517580..ed0c835 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -3,36 +3,36 @@ import datetime import random -from absl import flags +import absl -from pysc2.lib import actions +import pysc2.lib from baselines.logger import Logger, TensorBoardOutputFormat -from common.vec_env.subproc_vec_env import SubprocVecEnv +import common.vec_env.subproc_vec_env +import a2c from a2c.policies import CnnPolicy -from a2c import a2c -_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id -_SELECT_ARMY = actions.FUNCTIONS.select_army.id +_MOVE_SCREEN = pysc2.lib.actions.FUNCTIONS.Move_screen.id +_SELECT_ARMY = pysc2.lib.actions.FUNCTIONS.select_army.id _SELECT_ALL = [0] _NOT_QUEUED = [0] step_mul = 8 -FLAGS = flags.FLAGS -flags.DEFINE_string("map", "CollectMineralShards", +FLAGS = absl.flags.FLAGS +absl.flags.DEFINE_string("map", "CollectMineralShards", "Name of a map to use to play.") start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") -flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") -flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") -flags.DEFINE_integer("timesteps", 2000000, "Steps to train") -flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") -flags.DEFINE_boolean("prioritized", True, "prioritized_replay") -flags.DEFINE_boolean("dueling", True, "dueling") -flags.DEFINE_float("lr", 0.0005, "Learning rate") -flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") -flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") -flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") +absl.flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") +absl.flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") +absl.flags.DEFINE_integer("timesteps", 2000000, "Steps to train") +absl.flags.DEFINE_float("exploration_fraction", 0.5, "Exploration Fraction") +absl.flags.DEFINE_boolean("prioritized", True, "prioritized_replay") +absl.flags.DEFINE_boolean("dueling", True, "dueling") +absl.flags.DEFINE_float("lr", 0.0005, "Learning rate") +absl.flags.DEFINE_integer("num_agents", 4, "number of RL agents for A2C") +absl.flags.DEFINE_integer("num_scripts", 0, "number of script agents for A2C") +absl.flags.DEFINE_integer("nsteps", 20, "number of batch steps for A2C") PROJ_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -57,7 +57,9 @@ def main(): print("random lr : %s" % FLAGS.lr) lr_round = round(FLAGS.lr, 8) - logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, start_time) + logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( + FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, 
lr_round, + start_time) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) @@ -66,11 +68,11 @@ def main(): seed = 0 - env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, - FLAGS.map) + env = common.vec_env.subproc_vec_env.SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, + FLAGS.map) policy_fn = CnnPolicy - a2c.learn( + a2c.a2c.learn( policy_fn, env, seed, @@ -87,30 +89,33 @@ def main(): lrschedule='linear', callback=a2c_callback) -from baselines import logger + +import baselines + def a2c_callback(locals, globals): global max_mean_reward, last_filename - logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) - logger.record_tabular("num_episodes", locals['num_episodes']) - logger.record_tabular("environment_number", locals['env_num']) - logger.record_tabular("policy_loss", locals['model'].policy_loss) - logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) - logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) - logger.record_tabular("policy_entropy", locals['model'].policy_entropy) - logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) - logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) - logger.record_tabular("learning_rate_N", locals['model'].lr.n) - logger.record_tabular("learning_rate_V", locals['model'].lr.v) - logger.record_tabular("value_loss", locals['model'].value_loss) - logger.record_tabular("done", locals['done']) - - if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals['mean_100ep_reward_a2c'] > max_mean_reward: + baselines.logger.record_tabular("mean 100 episode reward a2c", locals['mean_100ep_reward_a2c']) + baselines.logger.record_tabular("num_episodes", locals['num_episodes']) + baselines.logger.record_tabular("environment_number", locals['env_num']) + baselines.logger.record_tabular("policy_loss", locals['model'].policy_loss) + baselines.logger.record_tabular("policy_loss_xy0", locals['model'].policy_loss_xy0) + baselines.logger.record_tabular("policy_loss_xy1", locals['model'].policy_loss_xy1) + baselines.logger.record_tabular("policy_entropy", locals['model'].policy_entropy) + baselines.logger.record_tabular("policy_entropy_xy0", locals['model'].policy_entropy_xy0) + baselines.logger.record_tabular("policy_entropy_xy1", locals['model'].policy_entropy_xy1) + baselines.logger.record_tabular("learning_rate_N", locals['model'].lr.n) + baselines.logger.record_tabular("learning_rate_V", locals['model'].lr.v) + baselines.logger.record_tabular("value_loss", locals['model'].value_loss) + baselines.logger.record_tabular("done", locals['done']) + + if 'mean_100ep_reward_a2c' in locals and locals['num_episodes'] >= 10 and locals[ + 'mean_100ep_reward_a2c'] > max_mean_reward: print("mean_100ep_reward_a2c : %s max_mean_reward : %s" % (locals['mean_100ep_reward_a2c'], max_mean_reward)) max_mean_reward = locals['mean_100ep_reward_a2c'] - logger.record_tabular("max_mean_reward", max_mean_reward) + baselines.logger.record_tabular("max_mean_reward", max_mean_reward) if not os.path.exists(os.path.join(PROJ_DIR, 'models/a2c/')): try: @@ -133,7 +138,8 @@ def a2c_callback(locals, globals): print("save best mean_100ep_reward model to %s" % filename) last_filename = filename - logger.dump_tabular() + baselines.logger.dump_tabular() + if __name__ == '__main__': main() From 58cc917d522647fe825c74b59e8970c79e771c56 Mon Sep 17 00:00:00 2001 From: 
rwill128 Date: Tue, 2 Mar 2021 17:16:41 -0500 Subject: [PATCH 08/11] Making corrections to requirements.txt --- requirements.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a23461..ec3d62e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,7 @@ -git+https://github.com/deepmind/pysc2 -git+https://github.com/openai/baselines -numpy~=1.19.5 -tensorflow~=1.14.0 absl-py -cloudpickle +cloudpickle==1.2.0 dill~=0.3.3 -tensorflow-gpu~=1.14.0 +tensorflow-gpu==1.14.0 joblib~=1.0.1 six~=1.15.0 future~=0.18.2 From 9228ef3608b960ce810d1ef9ecd348c95d1a7005 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 17:17:33 -0500 Subject: [PATCH 09/11] Making corrections to requirements.txt --- requirements.txt | 54 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec3d62e..f801d25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,50 @@ -absl-py +absl-py==0.11.0 +astor==0.8.1 +baselines==0.1.6 +cached-property==1.5.2 +certifi==2020.12.5 +chardet==4.0.0 +click==8.0.0a1 cloudpickle==1.2.0 -dill~=0.3.3 +deepdiff==5.2.3 +dill==0.3.3 +enum34==1.1.10 +future==0.18.2 +gast==0.4.0 +google-pasta==0.2.0 +grpcio==1.36.0 +gym==0.15.7 +h5py==3.1.0 +idna==2.10 +importlib-metadata==3.7.0 +joblib==1.0.1 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mock==4.0.3 +mpyq==0.2.5 +numpy==1.20.1 +opencv-python==4.5.1.48 +ordered-set==4.0.2 +portpicker==1.3.1 +protobuf==3.15.3 +pygame==2.0.1 +pysc2==2.0 +requests==2.25.1 +s2clientprotocol==5.0.6.83830.0 +s2protocol==5.0.6.83830.0 +scipy==1.6.1 +six==1.15.0 +sk-video==1.1.10 +tensorboard==1.14.0 +tensorflow-estimator==1.14.0 tensorflow-gpu==1.14.0 -joblib~=1.0.1 -six~=1.15.0 -future~=0.18.2 -s2clientprotocol~=5.0.6.83830.0 \ No newline at end of file +termcolor==1.1.0 +tqdm==4.58.0 +typing-extensions==3.7.4.3 +urllib3==1.26.3 +websocket-client==0.57.0 +Werkzeug==1.0.1 +whichcraft==0.6.1 +wrapt==1.12.1 +zipp==3.4.0 From a25a1d682905c1efebf8113ea61272e543931d94 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 19:20:01 -0500 Subject: [PATCH 10/11] Finally got things working again, and this time with a mostly legit requirements.txt file. 
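The diff below parses the command line with FLAGS(sys.argv) immediately after grabbing absl.flags.FLAGS and before any of the DEFINE_* calls, presumably to avoid the "flags not parsed" error when the script is run directly instead of through absl.app.run. As far as I can tell, that also means the flags defined afterwards only ever report their default values, and anything passed on the command line for them is rejected as an unknown flag at parse time. For comparison, a minimal sketch of the usual ordering (define first, parse once); the flag names are illustrative only:

import sys

from absl import flags

FLAGS = flags.FLAGS

# Define everything first...
flags.DEFINE_string("map", "CollectMineralShards", "Name of a map to use to play.")
flags.DEFINE_float("lr", 0.0005, "Learning rate")


def main():
    # ...then parse once, so values passed on the command line are honored.
    FLAGS(sys.argv)
    print(FLAGS.map, FLAGS.lr)


if __name__ == "__main__":
    main()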
--- train_mineral_shards_a2c.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/train_mineral_shards_a2c.py b/train_mineral_shards_a2c.py index ed0c835..647ea01 100644 --- a/train_mineral_shards_a2c.py +++ b/train_mineral_shards_a2c.py @@ -4,12 +4,13 @@ import random import absl +import baselines import pysc2.lib from baselines.logger import Logger, TensorBoardOutputFormat import common.vec_env.subproc_vec_env -import a2c +from a2c.a2c import learn from a2c.policies import CnnPolicy _MOVE_SCREEN = pysc2.lib.actions.FUNCTIONS.Move_screen.id @@ -20,8 +21,13 @@ step_mul = 8 FLAGS = absl.flags.FLAGS + +import sys + +FLAGS(sys.argv) + absl.flags.DEFINE_string("map", "CollectMineralShards", - "Name of a map to use to play.") + "Name of a map to use to play.") start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") absl.flags.DEFINE_string("log", "tensorboard", "logging type(stdout, tensorboard)") absl.flags.DEFINE_string("algorithm", "a2c", "RL algorithm to use.") @@ -58,8 +64,9 @@ def main(): lr_round = round(FLAGS.lr, 8) logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % ( - FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, lr_round, - start_time) + FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts, FLAGS.nsteps, + lr_round, + start_time) Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) @@ -72,7 +79,7 @@ def main(): FLAGS.map) policy_fn = CnnPolicy - a2c.a2c.learn( + learn( policy_fn, env, seed, @@ -90,9 +97,6 @@ def main(): callback=a2c_callback) -import baselines - - def a2c_callback(locals, globals): global max_mean_reward, last_filename From 9035a5d5155e2f8110483b786dbc8684ab19aa32 Mon Sep 17 00:00:00 2001 From: rwill128 Date: Tue, 2 Mar 2021 19:40:24 -0500 Subject: [PATCH 11/11] Making corrections to requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f801d25..4450412 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,7 +38,7 @@ six==1.15.0 sk-video==1.1.10 tensorboard==1.14.0 tensorflow-estimator==1.14.0 -tensorflow-gpu==1.14.0 +tensorflow==1.14.0 termcolor==1.1.0 tqdm==4.58.0 typing-extensions==3.7.4.3
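With the pins settled, a quick sanity check right after pip install -r requirements.txt can save another round of conda debugging. check_env.py below is a hypothetical helper, not something committed in this series, and the version table is just a subset of the pins above:

# check_env.py -- hypothetical helper, not part of the repo.
import pkg_resources

# Subset of the pins in requirements.txt; extend as needed.
EXPECTED = {
    "tensorflow": "1.14.0",
    "numpy": "1.20.1",
    "pysc2": "2.0",
    "baselines": "0.1.6",
    "absl-py": "0.11.0",
}

for name, wanted in EXPECTED.items():
    try:
        found = pkg_resources.get_distribution(name).version
    except pkg_resources.DistributionNotFound:
        print("MISSING  %-12s wanted %s" % (name, wanted))
        continue
    status = "OK" if found == wanted else "DIFF"
    print("%-8s %-12s wanted %-10s found %s" % (status, name, wanted, found))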