From c9245ea872610aa4fcc54aacdc1f6284a797afac Mon Sep 17 00:00:00 2001 From: verbose-void Date: Sun, 3 Nov 2024 15:39:58 -0800 Subject: [PATCH 1/4] average reward per frame --- interactor.py | 10 +++++++++- train_doom.py | 13 +++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/interactor.py b/interactor.py index dca5e8c..ab4b97f 100644 --- a/interactor.py +++ b/interactor.py @@ -101,8 +101,14 @@ def __init__(self, num_envs: int, watch: bool = False, env_id: str = "VizdoomCor cv2.namedWindow("screen", cv2.WINDOW_NORMAL) cv2.resizeWindow("screen", *DISPLAY_SIZE) + @property + def avg_rew_per_frame(self): + return self.current_episode_cumulative_rewards / (self.step_counter + 1) + def reset(self): self.current_episode_cumulative_rewards = torch.zeros(self.num_envs, dtype=torch.float32) + self.step_counter = torch.zeros(self.num_envs, dtype=torch.int32) + return self.env.reset() def step(self, actions=None): @@ -112,6 +118,7 @@ def step(self, actions=None): # Step the environments with the sampled actions observations, rewards, dones, infos = self.env.step(actions) self.current_episode_cumulative_rewards += rewards + self.step_counter += 1 # Show the screen from the 0th environment if watch is enabled if self.watch: @@ -123,7 +130,7 @@ def step(self, actions=None): cv2.putText(screen, f"Env: {self.watch_index}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) # also display the current reward - cv2.putText(screen, f"Ep Reward: {self.current_episode_cumulative_rewards[self.watch_index]}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) + cv2.putText(screen, f"Avg Reward per Frame: {self.avg_rew_per_frame[self.watch_index]}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.imshow("screen", screen) cv2.waitKey(1) # Display for 1 ms @@ -132,6 +139,7 @@ def step(self, actions=None): for i in range(self.num_envs): if dones[i]: self.current_episode_cumulative_rewards[i] = 0 + self.step_counter[i] = 0 # Return the results return observations, rewards, dones, infos diff --git a/train_doom.py b/train_doom.py index 03db2d4..3af5e99 100644 --- a/train_doom.py +++ b/train_doom.py @@ -228,7 +228,6 @@ def mini_cli(): observations = interactor.reset() cumulative_rewards_no_reset = torch.zeros((NUM_ENVS,)) - step_counters = torch.zeros((NUM_ENVS,), dtype=torch.float32) optimizer = torch.optim.Adam(agent.parameters(), lr=LR) @@ -291,17 +290,13 @@ def mini_cli(): episodic_rewards.append(interactor.current_episode_cumulative_rewards[i].item()) # TODO: criteria for best episode maybe should be most kills - if interactor.current_episode_cumulative_rewards[i].item() > best_episode_cumulative_reward: - best_episode_cumulative_reward = interactor.current_episode_cumulative_rewards[i].item() + if interactor.avg_rew_per_frame[i].item() > best_episode_cumulative_reward: + best_episode_cumulative_reward = interactor.avg_rew_per_frame[i].item() best_episode_env = i # Track which environment achieved the best reward best_episode = int(video_storage.episode_counters[i].item()) # Track the episode number episodic_rewards = torch.tensor(episodic_rewards) - # count the number of steps taken (reset if done) - step_counters += 1 - step_counters *= 1 - dones.float() - # call agent.reset with done flags for hidden state resetting agent.reset(dones) @@ -310,7 +305,7 @@ def mini_cli(): if TRAIN_ON_CUMULATIVE_REWARDS: # cumulative rewards if NORM_WITH_REWARD_COUNTER: - scores = interactor.current_episode_cumulative_rewards / step_counters + scores = interactor.avg_rew_per_frame else: scores = interactor.current_episode_cumulative_rewards else: @@ -328,6 +323,8 @@ def mini_cli(): print(f"------------- {step_i} -------------") print(f"Loss:\t\t{loss.item():.4f}") + print(f"Norm Scores:\t{norm_scores.mean().item():.4f}") + print(f"Scores:\t\t{scores.mean().item():.4f}") print(f"Entropy:\t{entropy.mean().item():.4f}") print(f"Log Prob:\t{log_probs.mean().item():.4f}") print(f"Reward:\t\t{rewards.mean().item():.4f}") From 9c85bf4b2a2b2a4006d24e07c6f69184e399ce1a Mon Sep 17 00:00:00 2001 From: verbose-void Date: Sun, 3 Nov 2024 15:45:25 -0800 Subject: [PATCH 2/4] reset with <-1 rew per frame --- interactor.py | 9 ++++++++- train_doom.py | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/interactor.py b/interactor.py index ab4b97f..2f2829f 100644 --- a/interactor.py +++ b/interactor.py @@ -130,13 +130,20 @@ def step(self, actions=None): cv2.putText(screen, f"Env: {self.watch_index}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) # also display the current reward - cv2.putText(screen, f"Avg Reward per Frame: {self.avg_rew_per_frame[self.watch_index]}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) + cv2.putText(screen, f"Avg Reward per Frame: {self.avg_rew_per_frame[self.watch_index]:.4f}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.imshow("screen", screen) cv2.waitKey(1) # Display for 1 ms # reset the reward sums for the environments that are done for i in range(self.num_envs): + # if the average reward per frame falls below -1, let's reset + if self.avg_rew_per_frame[i] < -1: + new_obs, _ = self.env.envs[i].reset() + observations[i] = torch.tensor(new_obs["screen"], dtype=torch.uint8) + self.current_episode_cumulative_rewards[i] = 0 + self.step_counter[i] = 0 + if dones[i]: self.current_episode_cumulative_rewards[i] = 0 self.step_counter[i] = 0 diff --git a/train_doom.py b/train_doom.py index 3af5e99..e6fae5d 100644 --- a/train_doom.py +++ b/train_doom.py @@ -323,8 +323,9 @@ def mini_cli(): print(f"------------- {step_i} -------------") print(f"Loss:\t\t{loss.item():.4f}") - print(f"Norm Scores:\t{norm_scores.mean().item():.4f}") - print(f"Scores:\t\t{scores.mean().item():.4f}") + # print(f"Norm Scores:\t{norm_scores.mean().item():.4f}") + # print(f"Scores:\t\t{scores.mean().item():.4f}") + print(f"Avg Rew Per Frame:\t{interactor.avg_rew_per_frame.mean().item():.4f}") print(f"Entropy:\t{entropy.mean().item():.4f}") print(f"Log Prob:\t{log_probs.mean().item():.4f}") print(f"Reward:\t\t{rewards.mean().item():.4f}") From ed5dc9747e6b5a2d3206b57597bbf82fc0d9b964 Mon Sep 17 00:00:00 2001 From: verbose-void Date: Mon, 4 Nov 2024 21:57:24 -0800 Subject: [PATCH 3/4] bug fix --- custom_doom.py | 2 +- interactor.py | 27 +++++++++++---------------- scenarios/freedom_custom.cfg | 4 ++-- train_doom.py | 8 +++++--- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/custom_doom.py b/custom_doom.py index 4e0ba6f..9e37de4 100644 --- a/custom_doom.py +++ b/custom_doom.py @@ -163,7 +163,7 @@ def _get_reward(self): # decrement reward for firing a weapon, unless we hit or killed an enemy landed_shot = deltas.KILLCOUNT != 0 or deltas.HITCOUNT != 0 if not landed_shot: - reward += deltas.SELECTED_WEAPON_AMMO * 30 + reward += deltas.SELECTED_WEAPON_AMMO * 10 # decrement reward for taking damage (already covered in HEALTH and ARMOR) # reward -= deltas.DAMAGE_TAKEN * 10 diff --git a/interactor.py b/interactor.py index 2f2829f..4aa6572 100644 --- a/interactor.py +++ b/interactor.py @@ -31,8 +31,6 @@ def __init__(self, num_envs: int, env_id: str): else: self.envs = [gymnasium.make(env_id) for _ in range(num_envs)] - self.dones = [False] * num_envs - # Pre-allocate observation and reward tensors first_obs_space = self.envs[0].observation_space['screen'] self.obs_shape = first_obs_space.shape @@ -44,7 +42,7 @@ def reset(self): for i in range(self.num_envs): obs, _ = self.envs[i].reset() self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor - self.dones[i] = False + self.dones_tensor[i] = False return self.observations def step(self, actions): @@ -54,25 +52,22 @@ def step(self, actions): all_infos = [] + self.dones_tensor[:] = False + for i in range(self.num_envs): - if self.dones[i]: + obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i]) + self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor + self.rewards[i] = reward + done = terminated or truncated + self.dones_tensor[i] = done + + if done: # Reset the environment if it was done in the last step obs, infos = self.envs[i].reset() self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor self.rewards[i] = 0 # No reward on reset - self.dones_tensor[i] = False - self.dones[i] = False - - all_infos.append(infos) - else: - obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i]) - self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor - self.rewards[i] = reward - done = terminated or truncated - self.dones_tensor[i] = done - self.dones[i] = done - all_infos.append(infos) + all_infos.append(infos) return self.observations, self.rewards, self.dones_tensor, all_infos diff --git a/scenarios/freedom_custom.cfg b/scenarios/freedom_custom.cfg index fb02a7f..59d241f 100644 --- a/scenarios/freedom_custom.cfg +++ b/scenarios/freedom_custom.cfg @@ -12,8 +12,8 @@ render_decals = false render_particles = false window_visible = false -doom_scenario_path = freedoom2.wad -doom_map = map02 +# doom_scenario_path = freedoom2.wad +doom_map = map01 doom_skill=1 # Each step is good for you! diff --git a/train_doom.py b/train_doom.py index e6fae5d..2e4882c 100644 --- a/train_doom.py +++ b/train_doom.py @@ -200,10 +200,11 @@ def mini_cli(): GRID_SIZE = int(np.ceil(np.sqrt(NUM_ENVS))) # Dynamically determine the grid size # LR = 1e-4 # works well for corridor - LR = 1e-3 + LR = 3e-3 TRAIN_ON_CUMULATIVE_REWARDS = True NORM_WITH_REWARD_COUNTER = True + BATCH_NORM_SCORES = True # episode tracking (for video saving and replay) MAX_VIDEO_FRAMES = 1024 # will be clipped if a best episode is found to log to wandb @@ -312,11 +313,12 @@ def mini_cli(): # instantaneous rewards scores = rewards - norm_scores = (scores - scores.mean()) / (scores.std() + 1e-8) + if BATCH_NORM_SCORES: + scores = (scores - scores.mean()) / (scores.std() + 1e-8) # specifically symlog after normalizing scores # norm_scores = symlog_torch(norm_scores) - loss = (-log_probs * norm_scores.to(device)).mean() + loss = (-log_probs * scores.to(device)).mean() loss.backward() optimizer.step() From e0ac899c1756f56bf73544339f5056675bdb3200 Mon Sep 17 00:00:00 2001 From: verbose-void Date: Mon, 4 Nov 2024 22:13:40 -0800 Subject: [PATCH 4/4] rew updates --- custom_doom.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/custom_doom.py b/custom_doom.py index 9e37de4..095906c 100644 --- a/custom_doom.py +++ b/custom_doom.py @@ -134,7 +134,7 @@ def _get_reward(self): # get deltas deltas = self._current_reward_features.get_deltas(self._prev_reward_features) - reward += deltas.KILLCOUNT * 1000 + reward += deltas.KILLCOUNT * 2000 reward += deltas.ITEMCOUNT * 10 reward += deltas.SECRETCOUNT * 3000 # reward += deltas.HITCOUNT * 100 @@ -158,11 +158,14 @@ def _get_reward(self): # any ammo decrease should be ignored. if deltas.SELECTED_WEAPON != 0: # if we changed weapons, ignore ammo change reward, but give a nice reward - reward += 1000 + # reward += 1000 # NOTE: this is buggy,.. gives a large reward for shooting away all ammo + pass else: # decrement reward for firing a weapon, unless we hit or killed an enemy landed_shot = deltas.KILLCOUNT != 0 or deltas.HITCOUNT != 0 - if not landed_shot: + if landed_shot: + reward += 300 + else: reward += deltas.SELECTED_WEAPON_AMMO * 10 # decrement reward for taking damage (already covered in HEALTH and ARMOR)