8 changes: 8 additions & 0 deletions .idea/.gitignore


18 changes: 17 additions & 1 deletion Solvers/Policy_Iteration.py
@@ -57,7 +57,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Greedy policy improvement: evaluate every action from state s and make
# the policy deterministic on the best one.
A = self.one_step_lookahead(s)
best_action = np.argmax(A)
self.policy[s, :] = np.eye(self.env.action_space.n)[best_action]
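# Reference for the improvement step above (a sketch of the standard formula;
# one_step_lookahead(s) is assumed to return the expected action values):
#   A[a] = sum over (s', r) of p(s', r | s, a) * (r + gamma * V(s'))
#   policy(s) <- deterministic on argmax_a A[a]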

# In DP methods we don't interact with the environment, so we set the reward to the sum of state values
# and the number of steps to -1, representing an invalid value
@@ -103,6 +105,20 @@ def policy_eval(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
num_states = self.env.observation_space.n
num_actions = self.env.action_space.n
gamma = self.options.gamma

# Build the transition matrix P_pi and expected reward vector R_pi induced
# by the current (deterministic) policy
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    a = np.argmax(self.policy[s])
    for prob, next_state, reward, done in self.env.P[s][a]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Evaluate the policy by solving the linear system for V_pi directly
self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
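# Why the direct solve works (a standard identity, included here as background):
# the Bellman expectation equation in matrix form is
#   V = R_pi + gamma * P_pi @ V   =>   (I - gamma * P_pi) @ V = R_pi,
# and (I - gamma * P_pi) is non-singular whenever gamma < 1, since P_pi is a
# stochastic matrix with spectral radius at most 1.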

def create_greedy_policy(self):
"""
15 changes: 14 additions & 1 deletion Solvers/Value_Iteration.py
@@ -71,6 +71,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Bellman optimality backup: set V(s) to the best one-step lookahead value
values = self.one_step_lookahead(each_state)
best_action_value = np.max(values)
self.V[each_state] = best_action_value

# Don't worry about this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
@@ -140,7 +143,9 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Act greedily with respect to the current value estimates
values = self.one_step_lookahead(state)
best_action = np.argmax(values)
return best_action

return policy_fn

@@ -192,6 +197,14 @@ def train_episode(self):
# Do a one-step lookahead to find the best action #
# Update the value function. Ref: Sutton book eq. 4.10. #
#########################################################
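# For reference, eq. 4.10 in Sutton & Barto (the value-iteration backup) is
#   V(s) <- max_a sum over (s', r) of p(s', r | s, a) * (r + gamma * V(s'));
# one_step_lookahead(s) is assumed to return that inner sum for every action a.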
# Recompute each state's priority as the negated absolute Bellman error,
# then back up only the state popped from the priority queue.
for s in range(self.env.observation_space.n):
    # Do a one-step lookahead to find the best action
    A = self.one_step_lookahead(s)
    best_action_value = np.max(A)
    priority = -abs(self.V[s] - best_action_value)
    self.pq.update(s, priority)
state = self.pq.pop()
self.V[state] = self.one_step_lookahead(state).max()

# you can ignore this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
98 changes: 98 additions & 0 deletions environment_mac_mod.yml
@@ -0,0 +1,98 @@
name: csce642
channels:
- conda-forge
- defaults
dependencies:
- python=3.9.16
- numpy=1.23.5
- swig
- box2d-py
- pip
- pip:
- absl-py==1.4.0
- ale-py==0.8.1
- asttokens==2.2.1
- backcall==0.2.0
- backports.functools-lru-cache==1.6.5
- certifi==2023.7.22
- cffi==1.15.1
- charset-normalizer==3.2.0
- cloudpickle==2.2.1
- cmake==3.27.2
- contourpy==1.1.0
- cycler==0.11.0
- Cython==3.0.0
- debugpy==1.6.7
- decorator==4.4.2
- entrypoints==0.4
- executing==1.2.0
- Farama-Notifications==0.0.4
- fasteners==0.18
- filelock==3.12.2
- fonttools==4.42.0
- glfw==2.6.2
- gymnasium==0.29.0
- idna==3.4
- imageio==2.31.1
- imageio-ffmpeg==0.4.8
- importlib-metadata==6.8.0
- importlib-resources==6.0.1
- ipykernel==6.14.0
- ipython==8.4.0
- jedi==0.19.0
- Jinja2==3.1.2
- joblib==1.3.2
- jupyter-client>=7.4.4
- jupyter_core==5.3.1
- kiwisolver==1.4.4
- lit==16.0.6
- lz4==4.3.2
- MarkupSafe==2.1.3
- matplotlib==3.7.2
- matplotlib-inline==0.1.6
- ml-dtypes>=0.3.1
- moviepy==1.0.3
- mpmath==1.3.0
- nest-asyncio==1.5.6
- networkx==3.1
- opencv-python==4.8.0.76
- opt-einsum==3.3.0
- packaging==23.1
- pandas==2.0.3
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.0
- platformdirs==3.10.0
- proglog==0.1.10
- prompt-toolkit==3.0.39
- psutil==5.9.0
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pycparser==2.21
- pygame==2.5.1
- Pygments==2.16.1
- pynput==1.7.6
- PyOpenGL==3.1.7
- pyparsing==3.0.9
- python-dateutil==2.8.2
- python-xlib==0.33
- pytz==2023.3
- requests==2.31.0
- scikit-learn==1.3.0
- setuptools==68.0.0
- Shimmy==0.2.1
- six==1.16.0
- stack-data==0.6.2
- sympy==1.12
- threadpoolctl==3.2.0
- torch==2.0.1
- tornado>=6.2.0
- tqdm==4.66.1
- traitlets==5.9.0
- typing_extensions==4.7.1
- tzdata==2023.3
- urllib3==2.0.4
- wcwidth==0.2.6
- wheel==0.38.4
- zipp==3.16.2