diff --git a/custom_doom.py b/custom_doom.py
index 4e0ba6f..095906c 100644
--- a/custom_doom.py
+++ b/custom_doom.py
@@ -134,7 +134,7 @@ def _get_reward(self):
 
         # get deltas
         deltas = self._current_reward_features.get_deltas(self._prev_reward_features)
-        reward += deltas.KILLCOUNT * 1000
+        reward += deltas.KILLCOUNT * 2000
         reward += deltas.ITEMCOUNT * 10
         reward += deltas.SECRETCOUNT * 3000
         # reward += deltas.HITCOUNT * 100
@@ -158,12 +158,15 @@ def _get_reward(self):
         # any ammo decrease should be ignored.
         if deltas.SELECTED_WEAPON != 0:
             # if we changed weapons, ignore ammo change reward, but give a nice reward
-            reward += 1000
+            # reward += 1000 # NOTE: this is buggy — gives a large reward for shooting away all ammo
+            pass
         else:
             # decrement reward for firing a weapon, unless we hit or killed an enemy
             landed_shot = deltas.KILLCOUNT != 0 or deltas.HITCOUNT != 0
-            if not landed_shot:
-                reward += deltas.SELECTED_WEAPON_AMMO * 30
+            if landed_shot:
+                reward += 300
+            else:
+                reward += deltas.SELECTED_WEAPON_AMMO * 10
 
         # decrement reward for taking damage (already covered in HEALTH and ARMOR)
         # reward -= deltas.DAMAGE_TAKEN * 10
diff --git a/interactor.py b/interactor.py
index dca5e8c..4aa6572 100644
--- a/interactor.py
+++ b/interactor.py
@@ -31,8 +31,6 @@ def __init__(self, num_envs: int, env_id: str):
         else:
             self.envs = [gymnasium.make(env_id) for _ in range(num_envs)]
 
-        self.dones = [False] * num_envs
-
         # Pre-allocate observation and reward tensors
         first_obs_space = self.envs[0].observation_space['screen']
         self.obs_shape = first_obs_space.shape
@@ -44,7 +42,7 @@ def reset(self):
         for i in range(self.num_envs):
             obs, _ = self.envs[i].reset()
             self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8)  # Fill the pre-allocated tensor
-            self.dones[i] = False
+            self.dones_tensor[i] = False
 
         return self.observations
 
@@ -54,25 +52,22 @@ def step(self, actions):
         all_infos = []
 
+        self.dones_tensor[:] = False
+
         for i in range(self.num_envs):
-            if self.dones[i]:
+            obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i])
+            self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8)  # Fill the pre-allocated tensor
+            self.rewards[i] = reward
+            done = terminated or truncated
+            self.dones_tensor[i] = done
+
+            if done:
                 # Reset the environment if it was done in the last step
                 obs, infos = self.envs[i].reset()
                 self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8)  # Fill the pre-allocated tensor
                 self.rewards[i] = 0  # No reward on reset
-                self.dones_tensor[i] = False
-                self.dones[i] = False
-
-                all_infos.append(infos)
-            else:
-                obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i])
-                self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8)  # Fill the pre-allocated tensor
-                self.rewards[i] = reward
-                done = terminated or truncated
-                self.dones_tensor[i] = done
-                self.dones[i] = done
-
-                all_infos.append(infos)
+
+            all_infos.append(infos)
 
         return self.observations, self.rewards, self.dones_tensor, all_infos
 
 
@@ -101,8 +96,14 @@ def __init__(self, num_envs: int, watch: bool = False, env_id: str = "VizdoomCor
             cv2.namedWindow("screen", cv2.WINDOW_NORMAL)
             cv2.resizeWindow("screen", *DISPLAY_SIZE)
 
+    @property
+    def avg_rew_per_frame(self):
+        return self.current_episode_cumulative_rewards / (self.step_counter + 1)
+
     def reset(self):
         self.current_episode_cumulative_rewards = torch.zeros(self.num_envs, dtype=torch.float32)
+        self.step_counter = torch.zeros(self.num_envs, dtype=torch.int32)
+
         return self.env.reset()
 
     def step(self, actions=None):
@@ -112,6 +113,7 @@
         # Step the environments with the sampled actions
         observations, rewards, dones, infos = self.env.step(actions)
         self.current_episode_cumulative_rewards += rewards
+        self.step_counter += 1
 
         # Show the screen from the 0th environment if watch is enabled
         if self.watch:
@@ -123,15 +125,23 @@
             cv2.putText(screen, f"Env: {self.watch_index}", (10, 30),
                         cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
             # also display the current reward
-            cv2.putText(screen, f"Ep Reward: {self.current_episode_cumulative_rewards[self.watch_index]}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+            cv2.putText(screen, f"Avg Reward per Frame: {self.avg_rew_per_frame[self.watch_index]:.4f}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
 
             cv2.imshow("screen", screen)
            cv2.waitKey(1)  # Display for 1 ms
 
         # reset the reward sums for the environments that are done
         for i in range(self.num_envs):
+            # if the average reward per frame falls below -1, let's reset
+            if self.avg_rew_per_frame[i] < -1:
+                new_obs, _ = self.env.envs[i].reset()
+                observations[i] = torch.tensor(new_obs["screen"], dtype=torch.uint8)
+                self.current_episode_cumulative_rewards[i] = 0
+                self.step_counter[i] = 0
+
             if dones[i]:
                 self.current_episode_cumulative_rewards[i] = 0
+                self.step_counter[i] = 0
 
         # Return the results
         return observations, rewards, dones, infos
diff --git a/scenarios/freedom_custom.cfg b/scenarios/freedom_custom.cfg
index fb02a7f..59d241f 100644
--- a/scenarios/freedom_custom.cfg
+++ b/scenarios/freedom_custom.cfg
@@ -12,8 +12,8 @@
 render_decals = false
 render_particles = false
 window_visible = false
-doom_scenario_path = freedoom2.wad
-doom_map = map02
+# doom_scenario_path = freedoom2.wad
+doom_map = map01
 doom_skill=1
 
 # Each step is good for you!
diff --git a/train_doom.py b/train_doom.py
index 03db2d4..2e4882c 100644
--- a/train_doom.py
+++ b/train_doom.py
@@ -200,10 +200,11 @@ def mini_cli():
    GRID_SIZE = int(np.ceil(np.sqrt(NUM_ENVS)))  # Dynamically determine the grid size
 
    # LR = 1e-4 # works well for corridor
-    LR = 1e-3
+    LR = 3e-3
 
    TRAIN_ON_CUMULATIVE_REWARDS = True
    NORM_WITH_REWARD_COUNTER = True
+    BATCH_NORM_SCORES = True
 
    # episode tracking (for video saving and replay)
    MAX_VIDEO_FRAMES = 1024  # will be clipped if a best episode is found to log to wandb
@@ -228,7 +229,6 @@
    observations = interactor.reset()
 
    cumulative_rewards_no_reset = torch.zeros((NUM_ENVS,))
-    step_counters = torch.zeros((NUM_ENVS,), dtype=torch.float32)
 
    optimizer = torch.optim.Adam(agent.parameters(), lr=LR)
 
@@ -291,17 +291,13 @@
                episodic_rewards.append(interactor.current_episode_cumulative_rewards[i].item())
 
                # TODO: criteria for best episode maybe should be most kills
-                if interactor.current_episode_cumulative_rewards[i].item() > best_episode_cumulative_reward:
-                    best_episode_cumulative_reward = interactor.current_episode_cumulative_rewards[i].item()
+                if interactor.avg_rew_per_frame[i].item() > best_episode_cumulative_reward:
+                    best_episode_cumulative_reward = interactor.avg_rew_per_frame[i].item()
                    best_episode_env = i  # Track which environment achieved the best reward
                    best_episode = int(video_storage.episode_counters[i].item())  # Track the episode number
 
        episodic_rewards = torch.tensor(episodic_rewards)
 
-        # count the number of steps taken (reset if done)
-        step_counters += 1
-        step_counters *= 1 - dones.float()
-
        # call agent.reset with done flags for hidden state resetting
        agent.reset(dones)
 
@@ -310,24 +306,28 @@
        if TRAIN_ON_CUMULATIVE_REWARDS:
            # cumulative rewards
            if NORM_WITH_REWARD_COUNTER:
-                scores = interactor.current_episode_cumulative_rewards / step_counters
+                scores = interactor.avg_rew_per_frame
            else:
                scores = interactor.current_episode_cumulative_rewards
        else:
            # instantaneous rewards
            scores = rewards
 
-        norm_scores = (scores - scores.mean()) / (scores.std() + 1e-8)
+        if BATCH_NORM_SCORES:
+            scores = (scores - scores.mean()) / (scores.std() + 1e-8)
 
        # specifically symlog after normalizing scores
        # norm_scores = symlog_torch(norm_scores)
 
-        loss = (-log_probs * norm_scores.to(device)).mean()
+        loss = (-log_probs * scores.to(device)).mean()
        loss.backward()
        optimizer.step()
 
        print(f"------------- {step_i} -------------")
        print(f"Loss:\t\t{loss.item():.4f}")
+        # print(f"Norm Scores:\t{norm_scores.mean().item():.4f}")
+        # print(f"Scores:\t\t{scores.mean().item():.4f}")
+        print(f"Avg Rew Per Frame:\t{interactor.avg_rew_per_frame.mean().item():.4f}")
        print(f"Entropy:\t{entropy.mean().item():.4f}")
        print(f"Log Prob:\t{log_probs.mean().item():.4f}")
        print(f"Reward:\t\t{rewards.mean().item():.4f}")