From 8b05f8eeecff899b35871de8b45dcb538f020f45 Mon Sep 17 00:00:00 2001
From: Robert Kirk
Date: Sun, 16 Jan 2022 14:17:30 +0000
Subject: [PATCH] Add evaluation logging

Periodically log test scores.
---
 phasic_policy_gradient/envs.py            | 13 ++++++++-----
 phasic_policy_gradient/log_save_helper.py | 24 +++++++++++++++++++++++-
 phasic_policy_gradient/logger.py          |  1 +
 phasic_policy_gradient/ppg.py             |  2 ++
 phasic_policy_gradient/ppo.py             | 16 +++++++++++++++-
 phasic_policy_gradient/train.py           | 12 +++++++++++-
 6 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/phasic_policy_gradient/envs.py b/phasic_policy_gradient/envs.py
index 0292a29..93abe77 100644
--- a/phasic_policy_gradient/envs.py
+++ b/phasic_policy_gradient/envs.py
@@ -2,11 +2,12 @@ import gym3
 from procgen import ProcgenGym3Env
 
 
-def get_procgen_venv(*, env_id, num_envs, rendering=False, **env_kwargs):
+def get_procgen_venv(*, env_id, num_envs, distribution_mode, start_level, num_levels, rendering=False, **env_kwargs):
     if rendering:
         env_kwargs["render_human"] = True
-
-    env = ProcgenGym3Env(num=num_envs, env_name=env_id, **env_kwargs)
+    env = ProcgenGym3Env(num=num_envs, env_name=env_id, \
+        distribution_mode=distribution_mode, start_level=start_level, \
+        num_levels=num_levels, **env_kwargs)
 
     env = gym3.ExtractDictObWrapper(env, "rgb")
 
@@ -14,7 +15,9 @@ def get_procgen_venv(*, env_id, num_envs, rendering=False, **env_kwargs):
         env = gym3.ViewerWrapper(env, info_key="rgb")
     return env
 
-def get_venv(num_envs, env_name, **env_kwargs):
-    venv = get_procgen_venv(num_envs=num_envs, env_id=env_name, **env_kwargs)
+def get_venv(num_envs, env_name, distribution_mode, start_level, num_levels, **env_kwargs):
+    venv = get_procgen_venv(num_envs=num_envs, env_id=env_name, \
+        distribution_mode=distribution_mode, start_level=start_level, \
+        num_levels=num_levels, **env_kwargs)
 
     return venv
\ No newline at end of file
diff --git a/phasic_policy_gradient/log_save_helper.py b/phasic_policy_gradient/log_save_helper.py
index af09fe3..386ddc8 100644
--- a/phasic_policy_gradient/log_save_helper.py
+++ b/phasic_policy_gradient/log_save_helper.py
@@ -53,6 +53,7 @@ def __init__(
         self.log_callbacks = log_callbacks
         self.log_new_eps = log_new_eps
         self.roller_stats = {}
+        self.eval_roller_stats = {}
 
     def __call__(self):
         self.total_interact_count += self.ic_per_step
@@ -85,6 +86,26 @@ def gather_roller_stats(self, roller):
                 }
             )
 
+    def gather_eval_roller_stats(self, roller):
+        self.eval_roller_stats = {
+            "EpRewMeanTest": self._nanmean([] if roller is None else roller.recent_eprets),
+            "EpLenMeanTest": self._nanmean([] if roller is None else roller.recent_eplens),
+        }
+        if roller is not None and self.log_new_eps:
+            assert roller.has_non_rolling_eps, "roller needs keep_non_rolling"
+            ret_n, ret_mean, ret_std = self._nanmoments(roller.non_rolling_eprets)
+            _len_n, len_mean, len_std = self._nanmoments(roller.non_rolling_eplens)
+            roller.clear_non_rolling_episode_buf()
+            self.eval_roller_stats.update(
+                {
+                    "NewEpNumTest": ret_n,
+                    "NewEpRewMeanTest": ret_mean,
+                    "NewEpRewStdTest": ret_std,
+                    "NewEpLenMeanTest": len_mean,
+                    "NewEpLenStdTest": len_std,
+                }
+            )
+
     def log(self):
         if self.log_callbacks is not None:
             for callback in self.log_callbacks:
@@ -93,7 +114,8 @@ def log(self):
 
         for k, v in self.roller_stats.items():
             logger.logkv(k, v)
-        logger.logkv("Misc/InteractCount", self.total_interact_count)
+        for k, v in self.eval_roller_stats.items():
+            logger.logkv(k, v)
         cur_time = time.time()
         Δtime = cur_time - self.last_time
         Δic = self.total_interact_count - self.last_ic
diff --git a/phasic_policy_gradient/logger.py b/phasic_policy_gradient/logger.py
index 95d7f98..1648ee9 100644
--- a/phasic_policy_gradient/logger.py
+++ b/phasic_policy_gradient/logger.py
@@ -472,6 +472,7 @@ def configure(
     dir: "(str|None) Local directory to write to" = None,
     format_strs: "(str|None) list of formats" = None,
     comm: "(MPI communicator | None) average numerical stats over comm" = None,
+    suffix: "(str|None) suffix of the file to write to" = None,
 ):
     if dir is None:
         if os.getenv("OPENAI_LOGDIR"):
diff --git a/phasic_policy_gradient/ppg.py b/phasic_policy_gradient/ppg.py
index 69e951c..710c680 100644
--- a/phasic_policy_gradient/ppg.py
+++ b/phasic_policy_gradient/ppg.py
@@ -218,6 +218,7 @@ def learn(
     *,
     model,
     venv,
+    eval_venv,
     ppo_hps,
     aux_lr,
     aux_mbsize,
@@ -245,6 +246,7 @@ def learn(
         # Policy phase
         ppo_state = ppo.learn(
             venv=venv,
+            eval_venv=eval_venv,
             model=model,
             learn_state=ppo_state,
             callbacks=[
diff --git a/phasic_policy_gradient/ppo.py b/phasic_policy_gradient/ppo.py
index 807c681..7b0099d 100644
--- a/phasic_policy_gradient/ppo.py
+++ b/phasic_policy_gradient/ppo.py
@@ -114,7 +114,8 @@ def compute_losses(
 
 def learn(
     *,
-    venv: "(VecEnv) vectorized environment",
+    venv: "(VecEnv) vectorized train environment",
+    eval_venv: "(VecEnv) vectorized test environment",
     model: "(ppo.PpoModel)",
     interacts_total: "(float) total timesteps of interaction" = float("inf"),
     nstep: "(int) number of serial timesteps" = 256,
@@ -199,6 +200,14 @@ def train_pi_and_vf(**arrays):
         keep_non_rolling=log_save_opts.get("log_new_eps", False),
     )
 
+    eval_roller = learn_state.get("eval_roller") or Roller(
+        act_fn=model.act,
+        venv=eval_venv,
+        initial_state=model.initial_state(eval_venv.num),
+        keep_buf=100,
+        keep_non_rolling=log_save_opts.get("log_new_eps", False),
+    )
+
     lsh = learn_state.get("lsh") or LogSaveHelper(
         ic_per_step=ic_per_step, model=model, comm=comm, **log_save_opts
     )
@@ -212,6 +221,10 @@ def train_pi_and_vf(**arrays):
     while curr_interact_count < interacts_total and not callback_exit:
         seg = roller.multi_step(nstep)
         lsh.gather_roller_stats(roller)
+
+        eval_seg = eval_roller.multi_step(nstep)
+        lsh.gather_eval_roller_stats(eval_roller)
+
         if rnorm:
             seg["reward"] = reward_normalizer(seg["reward"], seg["first"])
         compute_advantage(model, seg, γ, λ, comm=comm)
@@ -257,6 +270,7 @@ def train_pi_and_vf(**arrays):
     return dict(
         opts=opts,
         roller=roller,
+        eval_roller=eval_roller,
         lsh=lsh,
         reward_normalizer=reward_normalizer,
         curr_interact_count=curr_interact_count,
diff --git a/phasic_policy_gradient/train.py b/phasic_policy_gradient/train.py
index e32b3a1..5415b13 100644
--- a/phasic_policy_gradient/train.py
+++ b/phasic_policy_gradient/train.py
@@ -8,6 +8,8 @@ def train_fn(env_name="coinrun",
     distribution_mode="hard",
+    start_level=0,
+    num_levels=500,
     arch="dual",  # 'shared', 'detach', or 'dual'
     # 'shared' = shared policy and value networks
     # 'dual' = separate policy and value networks
@@ -38,7 +40,10 @@ def train_fn(env_name="coinrun",
     format_strs = ['csv', 'stdout'] if comm.Get_rank() == 0 else []
     logger.configure(comm=comm, dir=log_dir, format_strs=format_strs)
 
-    venv = get_venv(num_envs=num_envs, env_name=env_name, distribution_mode=distribution_mode)
+    venv = get_venv(num_envs=num_envs, env_name=env_name, distribution_mode=distribution_mode, \
+        start_level=start_level, num_levels=num_levels)
+    eval_venv = get_venv(num_envs=num_envs, env_name=env_name, distribution_mode=distribution_mode, \
+        start_level=0, num_levels=0)
 
     enc_fn = lambda obtype: ImpalaEncoder(
         obtype.shape,
@@ -55,6 +60,7 @@ def train_fn(env_name="coinrun",
 
     ppg.learn(
         venv=venv,
+        eval_venv=eval_venv,
         model=model,
         interacts_total=interacts_total,
         ppo_hps=dict(
@@ -79,6 +85,8 @@ def train_fn(env_name="coinrun",
 def main():
     parser = argparse.ArgumentParser(description='Process PPG training arguments.')
     parser.add_argument('--env_name', type=str, default='coinrun')
+    parser.add_argument('--start_level', type=int, default=0)
+    parser.add_argument('--num_levels', type=int, default=200)
     parser.add_argument('--num_envs', type=int, default=64)
     parser.add_argument('--n_epoch_pi', type=int, default=1)
     parser.add_argument('--n_epoch_vf', type=int, default=1)
@@ -94,6 +102,8 @@ def main():
 
     train_fn(
         env_name=args.env_name,
+        start_level=args.start_level,
+        num_levels=args.num_levels,
        num_envs=args.num_envs,
         n_epoch_pi=args.n_epoch_pi,
         n_epoch_vf=args.n_epoch_vf,
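
As a quick way to sanity-check the new environment plumbing outside of a full training run, here is a minimal sketch (not taken from the patch) that mirrors the two get_venv calls the patched train_fn makes; num_envs=4 is an illustrative value, while coinrun, hard, start_level=0 and num_levels=200 match the script's defaults:

    # Build train and eval venvs the same way the patched train_fn does.
    from phasic_policy_gradient.envs import get_venv

    train_venv = get_venv(num_envs=4, env_name="coinrun", distribution_mode="hard",
                          start_level=0, num_levels=200)  # fixed set of training levels
    eval_venv = get_venv(num_envs=4, env_name="coinrun", distribution_mode="hard",
                         start_level=0, num_levels=0)     # num_levels=0: unrestricted levels
    # Both are gym3 venvs; .num is the number of parallel environments, which
    # ppo.learn uses when constructing the rollers' initial state.
    print(train_venv.num, eval_venv.num)

Because the eval venv is created with num_levels=0 (Procgen's unrestricted level set), the new *Test keys (EpRewMeanTest, EpLenMeanTest, and the NewEp*Test moments) track performance on levels outside the training set rather than on the training levels themselves.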