diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/Solvers/Policy_Iteration.py b/Solvers/Policy_Iteration.py
index 06f8f32..4a33c7a 100644
--- a/Solvers/Policy_Iteration.py
+++ b/Solvers/Policy_Iteration.py
@@ -57,7 +57,9 @@ def train_episode(self):
             ################################
             #   YOUR IMPLEMENTATION HERE   #
             ################################
-
+            A = self.one_step_lookahead(s)
+            best_action = np.argmax(A)
+            self.policy[s, :] = np.eye(self.env.action_space.n)[best_action]
 
         # In DP methods we don't interact with the environment so we will set the reward to be the sum of state values
         # and the number of steps to -1 representing an invalid value
@@ -103,6 +105,20 @@ def policy_eval(self):
         ################################
         #   YOUR IMPLEMENTATION HERE   #
         ################################
+        num_states = self.env.observation_space.n
+        num_actions = self.env.action_space.n
+        gamma = self.options.gamma
+
+        # Build P_pi and R_pi
+        P_pi = np.zeros((num_states, num_states))
+        R_pi = np.zeros(num_states)
+        for s in range(num_states):
+            a = np.argmax(self.policy[s])
+            for prob, next_state, reward, done in self.env.P[s][a]:
+                P_pi[s, next_state] += prob
+                R_pi[s] += prob * reward
+
+        self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
 
     def create_greedy_policy(self):
         """
diff --git a/Solvers/Value_Iteration.py b/Solvers/Value_Iteration.py
index 0054241..aea5223 100644
--- a/Solvers/Value_Iteration.py
+++ b/Solvers/Value_Iteration.py
@@ -71,6 +71,9 @@ def train_episode(self):
             ################################
             #   YOUR IMPLEMENTATION HERE   #
             ################################
+            values = self.one_step_lookahead(each_state)
+            best_action_value = np.max(values)
+            self.V[each_state] = best_action_value
 
         # Dont worry about this part
         self.statistics[Statistics.Rewards.value] = np.sum(self.V)
@@ -140,7 +143,9 @@ def policy_fn(state):
             ################################
             #   YOUR IMPLEMENTATION HERE   #
             ################################
-
+            values = self.one_step_lookahead(state)
+            best_action = np.argmax(values)
+            return best_action
 
 
         return policy_fn
@@ -192,6 +197,14 @@ def train_episode(self):
         # Do a one-step lookahead to find the best action       #
         # Update the value function. Ref: Sutton book eq. 4.10. #
         #########################################################
+        for s in range(self.env.observation_space.n):
+            # Do a one-step lookahead to find the best action
+            A = self.one_step_lookahead(s)
+            best_action_value = np.max(A)
+            priority = -abs(self.V[s] - best_action_value)
+            self.pq.update(s, priority)
+        state = self.pq.pop()
+        self.V[state] = self.one_step_lookahead(state).max()
 
         # you can ignore this part
         self.statistics[Statistics.Rewards.value] = np.sum(self.V)
diff --git a/environment_mac_mod.yml b/environment_mac_mod.yml
new file mode 100644
index 0000000..8eb01bc
--- /dev/null
+++ b/environment_mac_mod.yml
@@ -0,0 +1,98 @@
+name: csce642
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.9.16
+  - numpy=1.23.5
+  - swig
+  - box2d-py
+  - pip
+  - pip:
+    - absl-py==1.4.0
+    - ale-py==0.8.1
+    - asttokens==2.2.1
+    - backcall==0.2.0
+    - backports.functools-lru-cache==1.6.5
+    - certifi==2023.7.22
+    - cffi==1.15.1
+    - charset-normalizer==3.2.0
+    - cloudpickle==2.2.1
+    - cmake==3.27.2
+    - contourpy==1.1.0
+    - cycler==0.11.0
+    - Cython==3.0.0
+    - debugpy==1.6.7
+    - decorator==4.4.2
+    - entrypoints==0.4
+    - executing==1.2.0
+    - Farama-Notifications==0.0.4
+    - fasteners==0.18
+    - filelock==3.12.2
+    - fonttools==4.42.0
+    - glfw==2.6.2
+    - gymnasium==0.29.0
+    - idna==3.4
+    - imageio==2.31.1
+    - imageio-ffmpeg==0.4.8
+    - importlib-metadata==6.8.0
+    - importlib-resources==6.0.1
+    - ipykernel==6.14.0
+    - ipython==8.4.0
+    - jedi==0.19.0
+    - Jinja2==3.1.2
+    - joblib==1.3.2
+    - jupyter-client>=7.4.4
+    - jupyter_core==5.3.1
+    - kiwisolver==1.4.4
+    - lit==16.0.6
+    - lz4==4.3.2
+    - MarkupSafe==2.1.3
+    - matplotlib==3.7.2
+    - matplotlib-inline==0.1.6
+    - ml-dtypes>=0.3.1
+    - moviepy==1.0.3
+    - mpmath==1.3.0
+    - nest-asyncio==1.5.6
+    - networkx==3.1
+    - opencv-python==4.8.0.76
+    - opt-einsum==3.3.0
+    - packaging==23.1
+    - pandas==2.0.3
+    - parso==0.8.3
+    - pexpect==4.8.0
+    - pickleshare==0.7.5
+    - Pillow==10.0.0
+    - platformdirs==3.10.0
+    - proglog==0.1.10
+    - prompt-toolkit==3.0.39
+    - psutil==5.9.0
+    - ptyprocess==0.7.0
+    - pure-eval==0.2.2
+    - pycparser==2.21
+    - pygame==2.5.1
+    - Pygments==2.16.1
+    - pynput==1.7.6
+    - PyOpenGL==3.1.7
+    - pyparsing==3.0.9
+    - python-dateutil==2.8.2
+    - python-xlib==0.33
+    - pytz==2023.3
+    - requests==2.31.0
+    - scikit-learn==1.3.0
+    - setuptools==68.0.0
+    - Shimmy==0.2.1
+    - six==1.16.0
+    - stack-data==0.6.2
+    - sympy==1.12
+    - threadpoolctl==3.2.0
+    - torch==2.0.1
+    - tornado>=6.2.0
+    - tqdm==4.66.1
+    - traitlets==5.9.0
+    - typing_extensions==4.7.1
+    - tzdata==2023.3
+    - urllib3==2.0.4
+    - wcwidth==0.2.6
+    - wheel==0.38.4
+    - zipp==3.16.2
\ No newline at end of file
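
Note on the Policy_Iteration.py hunk above: policy_eval solves the Bellman expectation equation V = R_pi + gamma * P_pi V exactly with a single linear solve instead of sweeping until convergence. Below is a minimal, self-contained sketch of that technique on a hypothetical two-state MDP, written in the same gym-style P[s][a] = [(prob, next_state, reward, done), ...] transition format the solver uses; the toy transition table and variable names here are illustrative only and are not part of the repository.

import numpy as np

# Hypothetical 2-state, 2-action MDP (illustrative only), in the
# gym-style format P[s][a] = [(prob, next_state, reward, done), ...].
P = {
    0: {0: [(0.8, 0, 0.0, False), (0.2, 1, 1.0, False)],
        1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 1, 0.0, True)],
        1: [(1.0, 1, 0.0, True)]},
}
num_states, gamma = 2, 0.9
policy = np.array([[0.0, 1.0],   # state 0: always take action 1
                   [1.0, 0.0]])  # state 1: always take action 0

# Build the policy's transition matrix P_pi and expected immediate
# reward vector R_pi, exactly as the diff does.
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    a = int(np.argmax(policy[s]))
    for prob, next_state, reward, done in P[s][a]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Exact policy evaluation: solve (I - gamma * P_pi) V = R_pi.
V_direct = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)

# Sanity check: iterative evaluation converges to the same values.
V_iter = np.zeros(num_states)
for _ in range(1000):
    V_iter = R_pi + gamma * P_pi @ V_iter
print(V_direct, V_iter)  # both should be close to [1.0, 0.0]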
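
The Value_Iteration.py hunks implement the Bellman optimality backup of Sutton & Barto eq. 4.10, V(s) <- max_a sum_{s'} p(s'|s, a) [r(s, a, s') + gamma * V(s')], via one_step_lookahead; the asynchronous variant additionally prioritizes states by the magnitude of their Bellman error before backing one up. Here is a rough, self-contained sketch of the synchronous backup and of greedy-policy extraction on the same toy MDP as above; the standalone one_step_lookahead helper below is a stand-in for the solver's method, not its actual signature.

import numpy as np

def one_step_lookahead(P, V, s, num_actions, gamma):
    # Q-value of each action from state s under the current estimate V.
    A = np.zeros(num_actions)
    for a in range(num_actions):
        for prob, next_state, reward, done in P[s][a]:
            A[a] += prob * (reward + gamma * V[next_state])
    return A

# Same hypothetical toy MDP as in the previous sketch (illustrative only).
P = {
    0: {0: [(0.8, 0, 0.0, False), (0.2, 1, 1.0, False)],
        1: [(1.0, 1, 1.0, False)]},
    1: {0: [(1.0, 1, 0.0, True)],
        1: [(1.0, 1, 0.0, True)]},
}
num_states, num_actions, gamma = 2, 2, 0.9

# Synchronous value iteration: sweep every state and apply eq. 4.10.
V = np.zeros(num_states)
for _ in range(100):
    for s in range(num_states):
        V[s] = np.max(one_step_lookahead(P, V, s, num_actions, gamma))

# Greedy policy extraction, mirroring create_greedy_policy's policy_fn.
greedy = [int(np.argmax(one_step_lookahead(P, V, s, num_actions, gamma)))
          for s in range(num_states)]
print(V, greedy)  # expected roughly [1.0, 0.0] and [1, 0]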