diff --git a/nasim/agents/bruteforce_agent.py b/nasim/agents/bruteforce_agent.py index caa5a377..76c8f2a3 100644 --- a/nasim/agents/bruteforce_agent.py +++ b/nasim/agents/bruteforce_agent.py @@ -72,7 +72,7 @@ def run_bruteforce_agent(env, step_limit=1e6, verbose=True): act = next(act_iter) cycle_complete = True - _, rew, done, env_step_limit_reached, _ = env.step(act) + _, rew, done, env_step_limit_reached, _ = env.step(int(act)) total_reward += rew if cycle_complete and verbose: diff --git a/nasim/agents/ql_replay_agent.py b/nasim/agents/ql_replay_agent.py index 13414bd3..29ccba28 100644 --- a/nasim/agents/ql_replay_agent.py +++ b/nasim/agents/ql_replay_agent.py @@ -247,10 +247,9 @@ def run_train_episode(self, step_limit): episode_return = 0 while not done and not env_step_limit_reached and steps < step_limit: - a = self.get_egreedy_action(o, self.get_epsilon()) - + a = self.get_egreedy_action(o[0], self.get_epsilon()) next_o, r, done, env_step_limit_reached, _ = self.env.step(a) - self.replay.store(o, a, next_o, r, done) + self.replay.store(o[0], a, next_o, r, done) self.steps_done += 1 mean_td_error, mean_v = self.optimize() self.logger.add_scalar( @@ -287,7 +286,7 @@ def run_eval_episode(self, input("Initial state. Press enter to continue..") while not done and not env_step_limit_reached: - a = self.get_egreedy_action(o, eval_epsilon) + a = self.get_egreedy_action(o[0], eval_epsilon) next_o, r, done, env_step_limit_reached, _ = env.step(a) o = next_o episode_return += r @@ -296,7 +295,7 @@ def run_eval_episode(self, print("\n" + line_break) print(f"Step {steps}") print(line_break) - print(f"Action Performed = {env.action_space.get_action(a)}") + print(f"Action Performed = {env.action_space.get_action(int(a))}") env.render(render_mode) print(f"Reward = {r}") print(f"Done = {done}") diff --git a/nasim/agents/random_agent.py b/nasim/agents/random_agent.py index e7fe7d1c..bc858946 100644 --- a/nasim/agents/random_agent.py +++ b/nasim/agents/random_agent.py @@ -35,7 +35,7 @@ def run_random_agent(env, step_limit=1e6, verbose=True): while not done and not env_step_limit_reached and t < step_limit: a = env.action_space.sample() - _, r, done, env_step_limit_reached, _ = env.step(a) + _, r, done, env_step_limit_reached, _ = env.step(int(a)) total_reward += r if (t+1) % 100 == 0 and verbose: print(f"{t}: {total_reward}")