8 changes: 8 additions & 0 deletions .idea/.gitignore


18 changes: 17 additions & 1 deletion Solvers/Policy_Iteration.py
@@ -57,7 +57,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Greedy policy improvement: evaluate every action from state s and make
# the policy deterministic on the best one.
A = self.one_step_lookahead(s)
best_action = np.argmax(A)
self.policy[s, :] = np.eye(self.env.action_space.n)[best_action]
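# Reference for the improvement step above (a sketch of the standard formula;
# one_step_lookahead(s) is assumed to return the expected action values):
#   A[a] = sum over (s', r) of p(s', r | s, a) * (r + gamma * V(s'))
#   policy(s) <- deterministic on argmax_a A[a]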

# In DP methods we don't interact with the environment, so we set the reward to the sum of state values
# and the number of steps to -1, representing an invalid value
@@ -103,6 +105,20 @@ def policy_eval(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
num_states = self.env.observation_space.n
num_actions = self.env.action_space.n
gamma = self.options.gamma

# Build the transition matrix P_pi and expected reward vector R_pi induced
# by the current (deterministic) policy
P_pi = np.zeros((num_states, num_states))
R_pi = np.zeros(num_states)
for s in range(num_states):
    a = np.argmax(self.policy[s])
    for prob, next_state, reward, done in self.env.P[s][a]:
        P_pi[s, next_state] += prob
        R_pi[s] += prob * reward

# Evaluate the policy by solving the linear system for V_pi directly
self.V = np.linalg.solve(np.eye(num_states) - gamma * P_pi, R_pi)
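# Why the direct solve works (a standard identity, included here as background):
# the Bellman expectation equation in matrix form is
#   V = R_pi + gamma * P_pi @ V   =>   (I - gamma * P_pi) @ V = R_pi,
# and (I - gamma * P_pi) is non-singular whenever gamma < 1, since P_pi is a
# stochastic matrix with spectral radius at most 1.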

def create_greedy_policy(self):
"""
15 changes: 14 additions & 1 deletion Solvers/Value_Iteration.py
@@ -71,6 +71,9 @@ def train_episode(self):
################################
# YOUR IMPLEMENTATION HERE #
################################
# Bellman optimality backup: set V(s) to the best one-step lookahead value
values = self.one_step_lookahead(each_state)
best_action_value = np.max(values)
self.V[each_state] = best_action_value

# Don't worry about this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
@@ -140,7 +143,9 @@ def policy_fn(state):
################################
# YOUR IMPLEMENTATION HERE #
################################

# Act greedily with respect to the current value estimates
values = self.one_step_lookahead(state)
best_action = np.argmax(values)
return best_action

return policy_fn

@@ -192,6 +197,14 @@ def train_episode(self):
# Do a one-step lookahead to find the best action #
# Update the value function. Ref: Sutton book eq. 4.10. #
#########################################################
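# For reference, eq. 4.10 in Sutton & Barto (the value-iteration backup) is
#   V(s) <- max_a sum over (s', r) of p(s', r | s, a) * (r + gamma * V(s'));
# one_step_lookahead(s) is assumed to return that inner sum for every action a.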
# Recompute each state's priority as the negated absolute Bellman error,
# then back up only the state popped from the priority queue.
for s in range(self.env.observation_space.n):
    # Do a one-step lookahead to find the best action
    A = self.one_step_lookahead(s)
    best_action_value = np.max(A)
    priority = -abs(self.V[s] - best_action_value)
    self.pq.update(s, priority)
state = self.pq.pop()
self.V[state] = self.one_step_lookahead(state).max()

# you can ignore this part
self.statistics[Statistics.Rewards.value] = np.sum(self.V)
98 changes: 98 additions & 0 deletions environment_mac_mod.yml
@@ -0,0 +1,98 @@
name: csce642
channels:
- conda-forge
- defaults
dependencies:
- python=3.9.16
- numpy=1.23.5
- swig
- box2d-py
- pip
- pip:
- absl-py==1.4.0
- ale-py==0.8.1
- asttokens==2.2.1
- backcall==0.2.0
- backports.functools-lru-cache==1.6.5
- certifi==2023.7.22
- cffi==1.15.1
- charset-normalizer==3.2.0
- cloudpickle==2.2.1
- cmake==3.27.2
- contourpy==1.1.0
- cycler==0.11.0
- Cython==3.0.0
- debugpy==1.6.7
- decorator==4.4.2
- entrypoints==0.4
- executing==1.2.0
- Farama-Notifications==0.0.4
- fasteners==0.18
- filelock==3.12.2
- fonttools==4.42.0
- glfw==2.6.2
- gymnasium==0.29.0
- idna==3.4
- imageio==2.31.1
- imageio-ffmpeg==0.4.8
- importlib-metadata==6.8.0
- importlib-resources==6.0.1
- ipykernel==6.14.0
- ipython==8.4.0
- jedi==0.19.0
- Jinja2==3.1.2
- joblib==1.3.2
- jupyter-client>=7.4.4
- jupyter_core==5.3.1
- kiwisolver==1.4.4
- lit==16.0.6
- lz4==4.3.2
- MarkupSafe==2.1.3
- matplotlib==3.7.2
- matplotlib-inline==0.1.6
- ml-dtypes>=0.3.1
- moviepy==1.0.3
- mpmath==1.3.0
- nest-asyncio==1.5.6
- networkx==3.1
- opencv-python==4.8.0.76
- opt-einsum==3.3.0
- packaging==23.1
- pandas==2.0.3
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.0
- platformdirs==3.10.0
- proglog==0.1.10
- prompt-toolkit==3.0.39
- psutil==5.9.0
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pycparser==2.21
- pygame==2.5.1
- Pygments==2.16.1
- pynput==1.7.6
- PyOpenGL==3.1.7
- pyparsing==3.0.9
- python-dateutil==2.8.2
- python-xlib==0.33
- pytz==2023.3
- requests==2.31.0
- scikit-learn==1.3.0
- setuptools==68.0.0
- Shimmy==0.2.1
- six==1.16.0
- stack-data==0.6.2
- sympy==1.12
- threadpoolctl==3.2.0
- torch==2.0.1
- tornado>=6.2.0
- tqdm==4.66.1
- traitlets==5.9.0
- typing_extensions==4.7.1
- tzdata==2023.3
- urllib3==2.0.4
- wcwidth==0.2.6
- wheel==0.38.4
- zipp==3.16.2