Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions custom_doom.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def _get_reward(self):
# get deltas
deltas = self._current_reward_features.get_deltas(self._prev_reward_features)

reward += deltas.KILLCOUNT * 1000
reward += deltas.KILLCOUNT * 2000
reward += deltas.ITEMCOUNT * 10
reward += deltas.SECRETCOUNT * 3000
# reward += deltas.HITCOUNT * 100
Expand All @@ -158,12 +158,15 @@ def _get_reward(self):
# any ammo decrease should be ignored.
if deltas.SELECTED_WEAPON != 0:
# if we changed weapons, ignore ammo change reward, but give a nice reward
reward += 1000
# reward += 1000 # NOTE: this is buggy: it gives a large reward for shooting away all ammo
pass
else:
# decrement reward for firing a weapon, unless we hit or killed an enemy
landed_shot = deltas.KILLCOUNT != 0 or deltas.HITCOUNT != 0
if not landed_shot:
reward += deltas.SELECTED_WEAPON_AMMO * 30
if landed_shot:
reward += 300
else:
reward += deltas.SELECTED_WEAPON_AMMO * 10

# decrement reward for taking damage (already covered in HEALTH and ARMOR)
# reward -= deltas.DAMAGE_TAKEN * 10
Expand Down
44 changes: 27 additions & 17 deletions interactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ def __init__(self, num_envs: int, env_id: str):
else:
self.envs = [gymnasium.make(env_id) for _ in range(num_envs)]

self.dones = [False] * num_envs

# Pre-allocate observation and reward tensors
first_obs_space = self.envs[0].observation_space['screen']
self.obs_shape = first_obs_space.shape
Expand All @@ -44,7 +42,7 @@ def reset(self):
for i in range(self.num_envs):
obs, _ = self.envs[i].reset()
self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor
self.dones[i] = False
self.dones_tensor[i] = False
return self.observations

def step(self, actions):
Expand All @@ -54,25 +52,22 @@ def step(self, actions):

all_infos = []

self.dones_tensor[:] = False

for i in range(self.num_envs):
if self.dones[i]:
obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i])
self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor
self.rewards[i] = reward
done = terminated or truncated
self.dones_tensor[i] = done

if done:
# Reset the environment if it was done in the last step
obs, infos = self.envs[i].reset()
self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor
self.rewards[i] = 0 # No reward on reset
self.dones_tensor[i] = False
self.dones[i] = False

all_infos.append(infos)
else:
obs, reward, terminated, truncated, infos = self.envs[i].step(actions[i])
self.observations[i] = torch.tensor(obs["screen"], dtype=torch.uint8) # Fill the pre-allocated tensor
self.rewards[i] = reward
done = terminated or truncated
self.dones_tensor[i] = done
self.dones[i] = done

all_infos.append(infos)
all_infos.append(infos)

return self.observations, self.rewards, self.dones_tensor, all_infos

Expand Down Expand Up @@ -101,8 +96,14 @@ def __init__(self, num_envs: int, watch: bool = False, env_id: str = "VizdoomCor
cv2.namedWindow("screen", cv2.WINDOW_NORMAL)
cv2.resizeWindow("screen", *DISPLAY_SIZE)

# Per-environment average reward per step for the current episode:
# current_episode_cumulative_rewards divided element-wise by (step_counter + 1).
# The +1 keeps the denominator non-zero when step_counter is 0, which is the
# value reset() assigns; it also means the divisor is always one larger than
# the stored step count.
@property
def avg_rew_per_frame(self):
return self.current_episode_cumulative_rewards / (self.step_counter + 1)

def reset(self):
# Re-create the per-environment episode statistics, one entry per env:
# a float32 accumulator of rewards and an int32 count of steps taken.
self.current_episode_cumulative_rewards = torch.zeros(self.num_envs, dtype=torch.float32)
self.step_counter = torch.zeros(self.num_envs, dtype=torch.int32)

# Delegate to the wrapped env object and return whatever its reset() returns.
return self.env.reset()

def step(self, actions=None):
Expand All @@ -112,6 +113,7 @@ def step(self, actions=None):
# Step the environments with the sampled actions
observations, rewards, dones, infos = self.env.step(actions)
self.current_episode_cumulative_rewards += rewards
self.step_counter += 1

# Show the screen from the 0th environment if watch is enabled
if self.watch:
Expand All @@ -123,15 +125,23 @@ def step(self, actions=None):
cv2.putText(screen, f"Env: {self.watch_index}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

# also display the current reward
cv2.putText(screen, f"Ep Reward: {self.current_episode_cumulative_rewards[self.watch_index]}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
cv2.putText(screen, f"Avg Reward per Frame: {self.avg_rew_per_frame[self.watch_index]:.4f}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

cv2.imshow("screen", screen)
cv2.waitKey(1) # Display for 1 ms

# reset the reward sums for the environments that are done
for i in range(self.num_envs):
# if the average reward per frame falls below -1, let's reset
if self.avg_rew_per_frame[i] < -1:
new_obs, _ = self.env.envs[i].reset()
observations[i] = torch.tensor(new_obs["screen"], dtype=torch.uint8)
self.current_episode_cumulative_rewards[i] = 0
self.step_counter[i] = 0

if dones[i]:
self.current_episode_cumulative_rewards[i] = 0
self.step_counter[i] = 0

# Return the results
return observations, rewards, dones, infos
Expand Down
4 changes: 2 additions & 2 deletions scenarios/freedom_custom.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ render_decals = false
render_particles = false
window_visible = false

doom_scenario_path = freedoom2.wad
doom_map = map02
# doom_scenario_path = freedoom2.wad
doom_map = map01

doom_skill=1
# Each step is good for you!
Expand Down
22 changes: 11 additions & 11 deletions train_doom.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,11 @@ def mini_cli():
GRID_SIZE = int(np.ceil(np.sqrt(NUM_ENVS))) # Dynamically determine the grid size

# LR = 1e-4 # works well for corridor
LR = 1e-3
LR = 3e-3

TRAIN_ON_CUMULATIVE_REWARDS = True
NORM_WITH_REWARD_COUNTER = True
BATCH_NORM_SCORES = True

# episode tracking (for video saving and replay)
MAX_VIDEO_FRAMES = 1024 # will be clipped if a best episode is found to log to wandb
Expand All @@ -228,7 +229,6 @@ def mini_cli():
observations = interactor.reset()

cumulative_rewards_no_reset = torch.zeros((NUM_ENVS,))
step_counters = torch.zeros((NUM_ENVS,), dtype=torch.float32)

optimizer = torch.optim.Adam(agent.parameters(), lr=LR)

Expand Down Expand Up @@ -291,17 +291,13 @@ def mini_cli():
episodic_rewards.append(interactor.current_episode_cumulative_rewards[i].item())

# TODO: the criterion for the best episode should maybe be the most kills
if interactor.current_episode_cumulative_rewards[i].item() > best_episode_cumulative_reward:
best_episode_cumulative_reward = interactor.current_episode_cumulative_rewards[i].item()
if interactor.avg_rew_per_frame[i].item() > best_episode_cumulative_reward:
best_episode_cumulative_reward = interactor.avg_rew_per_frame[i].item()
best_episode_env = i # Track which environment achieved the best reward
best_episode = int(video_storage.episode_counters[i].item()) # Track the episode number

episodic_rewards = torch.tensor(episodic_rewards)

# count the number of steps taken (reset if done)
step_counters += 1
step_counters *= 1 - dones.float()

# call agent.reset with done flags for hidden state resetting
agent.reset(dones)

Expand All @@ -310,24 +306,28 @@ def mini_cli():
if TRAIN_ON_CUMULATIVE_REWARDS:
# cumulative rewards
if NORM_WITH_REWARD_COUNTER:
scores = interactor.current_episode_cumulative_rewards / step_counters
scores = interactor.avg_rew_per_frame
else:
scores = interactor.current_episode_cumulative_rewards
else:
# instantaneous rewards
scores = rewards

norm_scores = (scores - scores.mean()) / (scores.std() + 1e-8)
if BATCH_NORM_SCORES:
scores = (scores - scores.mean()) / (scores.std() + 1e-8)

# specifically symlog after normalizing scores
# norm_scores = symlog_torch(norm_scores)
loss = (-log_probs * norm_scores.to(device)).mean()
loss = (-log_probs * scores.to(device)).mean()

loss.backward()
optimizer.step()

print(f"------------- {step_i} -------------")
print(f"Loss:\t\t{loss.item():.4f}")
# print(f"Norm Scores:\t{norm_scores.mean().item():.4f}")
# print(f"Scores:\t\t{scores.mean().item():.4f}")
print(f"Avg Rew Per Frame:\t{interactor.avg_rew_per_frame.mean().item():.4f}")
print(f"Entropy:\t{entropy.mean().item():.4f}")
print(f"Log Prob:\t{log_probs.mean().item():.4f}")
print(f"Reward:\t\t{rewards.mean().item():.4f}")
Expand Down