From ac6cab1145fbe90c8ba9a5a307b96a3aaf41551f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 15:28:21 -0400 Subject: [PATCH 01/43] initial commit for off-policy (but its still PPO2) --- gym/envs/pendulum/pendulum_config.py | 12 +- learning/algorithms/__init__.py | 1 + learning/algorithms/sac.py | 168 ++++++++++++++++++++++++ learning/runners/__init__.py | 3 +- learning/runners/off_policy_runner.py | 180 ++++++++++++++++++++++++++ 5 files changed, 357 insertions(+), 7 deletions(-) create mode 100644 learning/algorithms/sac.py create mode 100644 learning/runners/off_policy_runner.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..7a27e2db 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -27,9 +27,9 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi, torch.pi], + "theta": [-torch.pi / 4, torch.pi / 4], } - dof_vel_range = {"theta": [-5, 5]} + dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "DataLoggingRunner" + runner_class_name = "OffPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -91,9 +91,9 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.05 dof_vel = 0.0 - torques = 0.025 + torques = 0.01 class termination_weight: termination = 0.0 @@ -121,5 +121,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "SAC" num_steps_per_env = 32 diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index ace1b9fe..78231181 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,3 +33,4 @@ from .ppo import PPO from .ppo2 import PPO2 from .SE import StateEstimator +from .sac import SAC \ No newline at end of file diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py new file mode 100644 index 00000000..61b59bd1 --- /dev/null +++ b/learning/algorithms/sac.py @@ -0,0 +1,168 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) + + +class SAC: + def __init__( + self, + actor, + critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + loss_fn="MSE", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def test_mode(self): + 
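# --- [editor's note: comment added for this review, not part of the commit] ---
# As the subject line says, this first "SAC" class is still a verbatim copy of
# PPO2; only the config plumbing changes (PendulumRunnerCfg now selects
# runner_class_name = "OffPolicyRunner" and algorithm_class_name = "SAC").
# A minimal sketch of how such string-valued config entries are typically
# resolved to classes -- the getattr lookup below is illustrative, not this
# repo's actual mechanism:
#
#     import learning.algorithms as algorithms
#     import learning.runners as runners
#
#     runner_cls = getattr(runners, cfg.runner_class_name)            # OffPolicyRunner
#     alg_cls = getattr(algorithms, cfg.runner.algorithm_class_name)  # SAC
#     runner = runner_cls(env, train_cfg, device=device)
# ------------------------------------------------------------------------------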
self.actor.test() + self.critic.test() + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + return self.actor.act(obs).detach() + + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + + self.update_critic(data) + self.update_actor(data) + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + # ! 
check this + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_( + list(self.actor.parameters()) + list(self.critic.parameters()), + self.max_grad_norm, + ) + self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index b0f49217..a0840b06 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,4 +32,5 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .old_policy_runner import OldPolicyRunner \ No newline at end of file +from .old_policy_runner import OldPolicyRunner +from .off_policy_runner import OffPolicyRunner \ No newline at end of file diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py new file mode 100644 index 00000000..4a6af887 --- /dev/null +++ b/learning/runners/off_policy_runner.py @@ -0,0 +1,180 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class OffPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = 
torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) From df6a655ee87e1afa080e8777c924a33d1b23c0ab Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 18:37:58 -0400 Subject: [PATCH 02/43] pendulum baseline --- gym/envs/base/legged_robot_config.py | 27 ++++++++++------- gym/envs/mini_cheetah/mini_cheetah_config.py | 15 +--------- .../mini_cheetah/mini_cheetah_ref_config.py | 20 ++----------- gym/envs/pendulum/pendulum_config.py | 3 +- learning/algorithms/ppo2.py | 29 +++++++++---------- learning/utils/dict_utils.py | 11 ++++++- 6 files changed, 44 insertions(+), 61 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..f932ecd2 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -288,25 +288,30 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 
0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + mini_batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 24 + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..37c600a2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -190,20 +190,7 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..b8e167eb 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -138,27 +138,11 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True # deprecate for PPO2 - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - storage_size = 2**17 # new - mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 # [s] - lam = 0.95 - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 32 # deprecate diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 7a27e2db..43f37a35 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -84,7 +84,6 @@ class critic: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - standard_critic_nn = True class reward: class weights: @@ -107,7 +106,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 1.0e-3 + learning_rate = 1.0e-4 schedule = "fixed" # could be adaptive, fixed discount_horizon = 2.0 # [s] lam = 0.98 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..1463468a 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -13,8 +13,8 @@ def 
__init__( self, actor, critic, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -42,8 +42,8 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -78,10 +78,11 @@ def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: value_batch = self.critic.evaluate(batch["critic_obs"]) value_loss = self.critic.loss_fn(value_batch, batch["returns"]) @@ -94,8 +95,6 @@ def update_critic(self, data): self.mean_value_loss /= counter def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) self.mean_surrogate_loss = 0 counter = 0 @@ -106,12 +105,12 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - # ! refactor how this is done self.actor.act(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..eaf7b9db 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -60,10 +60,19 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator(data, batch_size, num_epochs, keys=None): +def create_uniform_generator( + data, batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): n, m = data.shape total_data = n * m + + if batch_size > total_data: + batch_size = total_data + num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 0a20324df89fcd5b7bf5d90afced862134894ce0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 10:20:06 -0400 Subject: [PATCH 03/43] quick fix on storage config --- gym/envs/base/fixed_robot_config.py | 25 ++++++++++++-------- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/pendulum/pendulum_config.py | 34 +++++++++++++++------------- learning/storage/SE_storage.py | 8 +++---- learning/storage/rollout_storage.py | 8 +++---- 5 files changed, 42 insertions(+), 35 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..fbecced6 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -165,20 +165,25 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 
1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index f932ecd2..43b09e68 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -296,7 +296,7 @@ class algorithm: max_grad_steps = 10 # new storage_size = 2**17 # new - mini_batch_size = 2**15 # new + batch_size = 2**15 # new clip_param = 0.2 learning_rate = 1.0e-3 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 43f37a35..4ad6c879 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OffPolicyRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -98,27 +98,29 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-4 - schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] + schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 - max_grad_norm = 1.0 - standard_loss = True - plus_c_penalty = 0.1 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "SAC" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/storage/SE_storage.py b/learning/storage/SE_storage.py index a63f5f9f..0f4eaf41 100644 --- a/learning/storage/SE_storage.py +++ b/learning/storage/SE_storage.py @@ -53,9 +53,9 @@ def clear(self): def mini_batch_generator(self, num_mini_batches, num_epochs=8): """Generate mini batch for learning""" batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -69,8 +69,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/storage/rollout_storage.py 
b/learning/storage/rollout_storage.py index 38a315d7..0f2885a3 100644 --- a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -184,9 +184,9 @@ def get_statistics(self): def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -207,8 +207,8 @@ def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] From 6fd4ba960ecbfd89d20424bc0ae889597947d335 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 10:45:47 -0400 Subject: [PATCH 04/43] refactor GAE computation to be explicit --- learning/algorithms/ppo2.py | 5 ++++- learning/utils/dict_utils.py | 34 +++++++++++++--------------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 1463468a..08c2f50d 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -67,9 +67,12 @@ def update(self, data, last_obs=None): else: with torch.no_grad(): last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( + + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( data, self.gamma, self.lam, self.critic, last_values ) + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) self.update_actor(data) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index eaf7b9db..d06250d4 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -9,30 +9,28 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): else: last_values = critic.evaluate(data["critic_obs"][-1]) - data.update({"returns": torch.zeros_like(data["rewards"])}) - data["returns"][-1] = last_values * ~data["dones"][-1] + returns = torch.zeros_like(data["rewards"]) + returns[-1] = last_values * ~data["dones"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] - data["returns"][k] = ( - data["rewards"][k] + gamma * data["returns"][k + 1] * not_done - ) - data["returns"] = (data["returns"] - data["returns"].mean()) / ( - data["returns"].std() + 1e-8 - ) - return + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + + return normalize(returns) @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) - data.update({"advantages": torch.zeros_like(data["values"])}) +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * 
not_done @@ -47,15 +45,9 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) - - data["returns"] = data["advantages"] + data["values"] + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + return normalize(advantages) # todo change num_epochs to num_batches From 890d56246d6cf8da36d6f5e6c10aee9afe2b040f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 16:54:16 -0400 Subject: [PATCH 05/43] set up off-policy, and SAC value function update. --- gym/envs/pendulum/pendulum_config.py | 6 +- learning/algorithms/ppo2.py | 5 +- learning/algorithms/sac.py | 111 +++++++++++++++++++------- learning/runners/off_policy_runner.py | 24 +++++- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 4ad6c879..5e2e0e01 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "OffPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -114,7 +114,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): # Critic use_clipped_value_loss = True # Actor - entropy_coef = 0.01 + entropy_coef = 0.1 schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 @@ -122,5 +122,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "SAC" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 08c2f50d..9d80655c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -160,10 +160,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 61b59bd1..2565b735 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -12,9 +12,10 @@ class SAC: def __init__( self, actor, - critic, - num_learning_epochs=1, - num_mini_batches=1, + critic_1, + critic_2, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -37,26 +38,41 @@ def __init__( # * PPO components self.actor = actor.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) - self.critic = critic.to(self.device) - self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + self.critic_1 = critic_1.to(self.device) + self.critic_2 = critic_2.to(self.device) + self.critic_optimizer = optim.Adam( + list(self.critic_1.parameters()) + list(self.critic_2.parameters()), + lr=learning_rate, + ) # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + 
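# --- [editor's note: comment added for this review, not part of the commit] ---
# batch_size and max_gradient_steps replace the old num_mini_batches /
# num_learning_epochs pair. With create_uniform_generator as rewritten in
# patch 02, the bookkeeping works out as follows (numbers are only an example):
#     total_data = num_envs * num_steps_per_env = 4096 * 32 = 131072
#     batch_size = 2**15 = 32768  -> num_batches_per_epoch = 131072 // 32768 = 4
#     max_gradient_steps = 10     -> num_epochs = max(10 // 4, 1) = 2
# so the generator yields 2 * 4 = 8 mini-batches, i.e. 8 gradient steps -- the
# closest it can get to the requested 10 without running a partial epoch.
# ------------------------------------------------------------------------------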
self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss + # * SAC parameters + self.learning_starts = 100 + self.batch_size = batch_size + self.tau = 0.005 + self.gamma = 0.99 + self.ent_coef = "fixed" + self.target_entropy = "fixed" + self.mean_surrogate_loss = 0 + self.mean_value_loss = 0 def test_mode(self): self.actor.test() - self.critic.test() + self.critic_1.test() + self.critic_2.test() def switch_to_train(self): self.actor.train() - self.critic.train() + self.critic_1.train() + self.critic_2.train() def act(self, obs, critic_obs): return self.actor.act(obs).detach() @@ -64,30 +80,67 @@ def act(self, obs, critic_obs): def update(self, data, last_obs=None): if last_obs is None: last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values - ) + # else: + # todo + # with torch.no_grad(): + # last_values_1 = self.critic_1.evaluate(last_obs).detach() + # last_values_2 = self.critic_2.evaluate(last_obs).detach() + # last_values = torch.min(last_values_1, last_values_2) + + # data["values"] = self.critic.evaluate(data["critic_obs"]) + # data["advantages"] = compute_generalized_advantages( + # data, self.gamma, self.lam, self.critic, last_values + # ) + # data["returns"] = data["advantages"] + data["values"] self.update_critic(data) + + # todo properly take min over here too? but might not be needed for SAC anyway + obs_actions = torch.cat((data["critic_obs"], data["actions"]), dim=-1) + data["values"] = self.critic_1.evaluate(obs_actions) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic_1, last_values + ) + data["returns"] = data["advantages"] + data["values"] self.update_actor(data) def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + actions = self.actor.act(batch["actor_obs"]) + actions_log_prob = self.actor.get_actions_log_prob(actions) + + obs_action_batch = torch.cat((batch["critic_obs"], actions), dim=-1) + qvalue_1 = self.critic_1.evaluate(obs_action_batch) + qvalue_2 = self.critic_2.evaluate(obs_action_batch) + qvalue_min = torch.min(qvalue_1, qvalue_2) + + targets = ( + self.gamma + * batch["dones"].logical_not() + * (qvalue_min - self.entropy_coef * actions_log_prob) + ) + batch["rewards"] + + value_loss = self.critic_1.loss_fn( + qvalue_1, targets + ) + self.critic_2.loss_fn(qvalue_2, targets) + ##### + # value_batch = self.critic.evaluate(batch["critic_obs"]) + # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + ##### self.critic_optimizer.zero_grad() value_loss.backward() - nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + list(self.critic_1.parameters()) + list(self.critic_2.parameters()), + self.max_grad_norm, + ) self.critic_optimizer.step() self.mean_value_loss += value_loss.item() counter += 1 @@ -106,10 +159,11 @@ def 
update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: # ! refactor how this is done self.actor.act(batch["actor_obs"]) @@ -158,10 +212,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 4a6af887..81963022 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -5,7 +5,9 @@ from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.modules import Actor, Critic from learning.storage import DictStorage +from learning.algorithms import SAC logger = Logger() storage = DictStorage() @@ -21,6 +23,15 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = SAC(actor, critic_1, critic_2, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() @@ -141,7 +152,9 @@ def set_up_logger(self): ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_1, self.alg.critic_2) + ) def save(self): os.makedirs(self.log_dir, exist_ok=True) @@ -149,7 +162,8 @@ def save(self): torch.save( { "actor_state_dict": self.alg.actor.state_dict(), - "critic_state_dict": self.alg.critic.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, @@ -160,7 +174,8 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) + self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_optimizer.load_state_dict( @@ -170,7 +185,8 @@ def load(self, path, load_optimizer=True): def switch_to_eval(self): self.alg.actor.eval() - self.alg.critic.eval() + self.alg.critic_1.eval() + self.alg.critic_2.eval() def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) From 200a14733fa167b95d3e38c4c4937860faf8c096 Mon Sep 17 00:00:00 2001 From: Steve 
Heim Date: Tue, 26 Mar 2024 17:37:02 -0400 Subject: [PATCH 06/43] store next_obs explicitly, use for last_values Comment: last_values doesn't seem to have much of an impact on performance at all. However, for off-policy algorithms, having access to the next state (obs) explicitly starts to be a lot handier. Alternative is to do some data juggling to pull it out, which is a bit tedious and probably needs to be stored in a tensor anyway to allow the batch generation --- learning/algorithms/ppo2.py | 10 ++-------- learning/runners/on_policy_runner.py | 4 ++++ learning/utils/dict_utils.py | 4 +++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 9d80655c..6462fdba 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -61,16 +61,10 @@ def switch_to_train(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - + def update(self, data): data["values"] = self.critic.evaluate(data["critic_obs"]) data["advantages"] = compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..6a4a4265 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,8 +37,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -89,6 +91,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index d06250d4..3b102338 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -24,7 +24,8 @@ def normalize(input, eps=1e-8): @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this @@ -65,6 +66,7 @@ def create_uniform_generator( if max_gradient_steps: num_epochs = max_gradient_steps // num_batches_per_epoch num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 96d5abb548e3d1cd7307415ff377f074311e23be Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 18:43:58 -0400 Subject: [PATCH 07/43] refactor critic loss function to consume obs and self-evaluate --- learning/algorithms/ppo2.py | 4 +--- learning/modules/critic.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 6462fdba..ed796400 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,7 +24,6 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - 
loss_fn="MSE", device="cpu", **kwargs, ): @@ -81,8 +80,7 @@ def update_critic(self, data): max_gradient_steps=self.max_gradient_steps, ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 732f8eda..8ef5395d 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -26,5 +26,5 @@ def evaluate(self, critic_observations): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - def loss_fn(self, input, target): - return nn.functional.mse_loss(input, target, reduction="mean") + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") From bf056128a0cd343bb5d2d77e91e6ec210767f4b8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 27 Mar 2024 09:30:17 -0400 Subject: [PATCH 08/43] WIP: should be SAC now, with a lot of rough edges, and missing the tanh squashing --- learning/algorithms/sac.py | 63 +++++++++++---------------- learning/runners/off_policy_runner.py | 4 ++ 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 2565b735..d0ea3906 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -4,7 +4,6 @@ from learning.utils import ( create_uniform_generator, - compute_generalized_advantages, ) @@ -77,35 +76,9 @@ def switch_to_train(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - # else: - # todo - # with torch.no_grad(): - # last_values_1 = self.critic_1.evaluate(last_obs).detach() - # last_values_2 = self.critic_2.evaluate(last_obs).detach() - # last_values = torch.min(last_values_1, last_values_2) - - # data["values"] = self.critic.evaluate(data["critic_obs"]) - # data["advantages"] = compute_generalized_advantages( - # data, self.gamma, self.lam, self.critic, last_values - # ) - # data["returns"] = data["advantages"] + data["values"] - - self.update_critic(data) - - # todo properly take min over here too? 
but might not be needed for SAC anyway - obs_actions = torch.cat((data["critic_obs"], data["actions"]), dim=-1) - data["values"] = self.critic_1.evaluate(obs_actions) - data["advantages"] = compute_generalized_advantages( - data, self.gamma, self.lam, self.critic_1, last_values - ) - data["returns"] = data["advantages"] + data["values"] - self.update_actor(data) - - def update_critic(self, data): + def update(self, data): self.mean_value_loss = 0 + self.mean_surrogate_loss = 0 counter = 0 generator = create_uniform_generator( @@ -114,23 +87,24 @@ def update_critic(self, data): max_gradient_steps=self.max_gradient_steps, ) for batch in generator: - actions = self.actor.act(batch["actor_obs"]) - actions_log_prob = self.actor.get_actions_log_prob(actions) + next_action = self.actor.act(batch["next_actor_obs"]) + next_actions_log_prob = self.actor.get_actions_log_prob(next_action) - obs_action_batch = torch.cat((batch["critic_obs"], actions), dim=-1) - qvalue_1 = self.critic_1.evaluate(obs_action_batch) - qvalue_2 = self.critic_2.evaluate(obs_action_batch) + next_obs_action = torch.cat((batch["next_critic_obs"], next_action), dim=-1) + qvalue_1 = self.critic_1.evaluate(next_obs_action) + qvalue_2 = self.critic_2.evaluate(next_obs_action) qvalue_min = torch.min(qvalue_1, qvalue_2) targets = ( self.gamma * batch["dones"].logical_not() - * (qvalue_min - self.entropy_coef * actions_log_prob) + * (qvalue_min - self.entropy_coef * next_actions_log_prob) ) + batch["rewards"] + obs_action = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) value_loss = self.critic_1.loss_fn( - qvalue_1, targets - ) + self.critic_2.loss_fn(qvalue_2, targets) + obs_action, targets + ) + self.critic_2.loss_fn(obs_action, targets) ##### # value_batch = self.critic.evaluate(batch["critic_obs"]) # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) @@ -144,6 +118,21 @@ def update_critic(self, data): self.critic_optimizer.step() self.mean_value_loss += value_loss.item() counter += 1 + + action = self.actor.act(batch["actor_obs"]) + actions_log_prob = self.actor.get_actions_log_prob(action) + obs_action = torch.cat((batch["critic_obs"], action), dim=-1) + qvalue_1 = self.critic_1.evaluate(obs_action) + qvalue_2 = self.critic_2.evaluate(obs_action) + qvalue_min = torch.min(qvalue_1, qvalue_2) + # update actor + actor_loss = (qvalue_min - self.entropy_coef * actions_log_prob).mean() + self.optimizer.zero_grad() + actor_loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += actor_loss.item() + self.mean_value_loss /= counter def update_actor(self, data): diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 81963022..e70971ed 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -48,8 +48,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -100,6 +102,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, From 692502c8f5fd12909b572cdacc646afdcbe5c368 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 27 Mar 2024 13:30:11 -0400 Subject: [PATCH 09/43] zap useless code 
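
[editor's note, added for this review -- not part of the original commit message]
Patch 08 above flags that the tanh squashing is still missing; patch 10 below
adds it inside act(), update_critic() and update_actor_and_alpha(). For
reference, a minimal self-contained sketch of a tanh-squashed Gaussian sample
with the standard change-of-variables log-probability correction (the helper
name and the eps constant are illustrative):

    import torch
    from torch.distributions import Normal

    def squashed_gaussian_sample(mean, std, action_delta, action_offset, eps=1e-6):
        dist = Normal(mean, std)
        u = dist.rsample()                 # reparameterized pre-squash sample
        a_norm = torch.tanh(u)             # squash into (-1, 1)
        # log pi(a|s) = log N(u) - sum_i log(1 - tanh(u_i)^2 + eps)
        logp = dist.log_prob(u).sum(-1) - torch.log(1.0 - a_norm.pow(2) + eps).sum(-1)
        action = a_norm * action_delta + action_offset   # rescale to the action range
        return action, logp
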
--- learning/algorithms/ppo.py | 3 --- learning/algorithms/ppo2.py | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 92898ad5..befeea79 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -105,9 +105,6 @@ def init_storage( self.device, ) - def test_mode(self): - self.actor_critic.test() - def train_mode(self): self.actor_critic.train() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index ed796400..3e5f454d 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -49,15 +49,11 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def test_mode(self): - self.actor.test() - self.critic.test() - def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data): From 5c7d8984d00ccea8bf088e63c3dc1e3adc07dd7c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 13:50:43 -0400 Subject: [PATCH 10/43] WIP: SAC implementation (based off rsl_rl) runs Does not train, need to tune. --- .../mini_cheetah/mini_cheetah_osc_config.py | 1 + learning/algorithms/sac.py | 372 ++++++++++-------- learning/modules/__init__.py | 1 + learning/modules/chimera_actor.py | 76 ++++ learning/modules/critic.py | 11 +- learning/runners/off_policy_runner.py | 71 ++-- learning/runners/on_policy_runner.py | 4 +- learning/utils/__init__.py | 2 +- learning/utils/utils.py | 6 + 9 files changed, 356 insertions(+), 188 deletions(-) create mode 100644 learning/modules/chimera_actor.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a4acefa6..5aaafde3 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -160,6 +160,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" class policy: hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index d0ea3906..52be75ed 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -2,9 +2,7 @@ import torch.nn as nn import torch.optim as optim -from learning.utils import ( - create_uniform_generator, -) +from learning.utils import create_uniform_generator, polyak_update class SAC: @@ -13,196 +11,252 @@ def __init__( actor, critic_1, critic_2, + target_critic_1, + target_critic_2, batch_size=2**15, max_gradient_steps=10, - clip_param=0.2, - gamma=0.998, - lam=0.95, - entropy_coef=0.0, - learning_rate=1e-3, + action_max=10.0, + action_min=-10.0, + actor_noise_std=1.0, + log_std_max=4.0, + log_std_min=-20.0, + alpha=0.2, + alpha_lr=1e-4, + target_entropy=None, max_grad_norm=1.0, - use_clipped_value_loss=True, - schedule="fixed", - desired_kl=0.01, - loss_fn="MSE", + polyak=0.995, + gamma=0.99, + # clip_param=0.2, # * PPO + # gamma=0.998, + # lam=0.95, + # entropy_coef=0.0, + actor_lr=1e-4, + critic_lr=1e-4, + # max_grad_norm=1.0, + # use_clipped_value_loss=True, + # schedule="fixed", + # desired_kl=0.01, device="cpu", **kwargs, ): self.device = device - self.desired_kl = desired_kl - self.schedule = schedule - self.learning_rate = learning_rate + # self.desired_kl = desired_kl + # self.schedule = schedule + # self.lr = lr # * PPO components self.actor = actor.to(self.device) - 
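# --- [editor's note: comment added for this review, not part of the commit] ---
# polyak_update is imported from learning.utils above, but its body is not part
# of this patch excerpt. A soft target-network update of the kind it is used
# for here usually looks like the sketch below (assumed implementation, shown
# only for context):
#
#     @torch.no_grad()
#     def polyak_update(net, target_net, polyak):
#         # theta_target <- polyak * theta_target + (1 - polyak) * theta
#         for p, p_targ in zip(net.parameters(), target_net.parameters()):
#             p_targ.mul_(polyak).add_((1.0 - polyak) * p)
#         return target_net
#
# Note also that in update() below the polyak step is applied to target_critic_1
# twice; the second call presumably should update target_critic_2 from critic_2.
# ------------------------------------------------------------------------------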
self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_1 = critic_1.to(self.device) self.critic_2 = critic_2.to(self.device) - self.critic_optimizer = optim.Adam( - list(self.critic_1.parameters()) + list(self.critic_2.parameters()), - lr=learning_rate, + self.target_critic_1 = target_critic_1.to(self.device) + self.target_critic_2 = target_critic_2.to(self.device) + self.target_critic_1.load_state_dict(self.critic_1.state_dict()) + self.target_critic_2.load_state_dict(self.critic_2.state_dict()) + + self.log_alpha = torch.log(torch.tensor(alpha)).requires_grad_() + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) + self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr) + self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr) + + # this is probably something to put into the neural network + self.action_max = action_max + self.action_min = action_min + + self.action_delta = (self.action_max - self.action_min) / 2.0 + self.action_offset = (self.action_max + self.action_min) / 2.0 + + self.max_grad_norm = max_grad_norm + self.target_entropy = ( + target_entropy if target_entropy else -self.actor.num_actions ) # * PPO parameters - self.clip_param = clip_param - self.batch_size = batch_size + # self.clip_param = clip_param + # self.batch_size = batch_size self.max_gradient_steps = max_gradient_steps - self.entropy_coef = entropy_coef - self.gamma = gamma - self.lam = lam - self.max_grad_norm = max_grad_norm - self.use_clipped_value_loss = use_clipped_value_loss + # self.entropy_coef = entropy_coef + # self.gamma = gamma + # self.lam = lam + # self.max_grad_norm = max_grad_norm + # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters self.learning_starts = 100 self.batch_size = batch_size - self.tau = 0.005 - self.gamma = 0.99 - self.ent_coef = "fixed" - self.target_entropy = "fixed" - self.mean_surrogate_loss = 0 - self.mean_value_loss = 0 - - def test_mode(self): - self.actor.test() - self.critic_1.test() - self.critic_2.test() + self.polyak = polyak + self.gamma = gamma + # self.ent_coef = "fixed" + # self.target_entropy = "fixed" + + @property + def alpha(self): + return self.log_alpha.exp() def switch_to_train(self): self.actor.train() self.critic_1.train() self.critic_2.train() + self.target_critic_1.train() + self.target_critic_2.train() - def act(self, obs, critic_obs): - return self.actor.act(obs).detach() + def switch_to_eval(self): + self.actor.eval() + self.critic_1.eval() + self.critic_2.eval() + self.target_critic_1.eval() + self.target_critic_2.eval() - def update(self, data): - self.mean_value_loss = 0 - self.mean_surrogate_loss = 0 - counter = 0 + def act(self, obs): + mean, std = self.actor.forward(obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() - generator = create_uniform_generator( - data, - self.batch_size, - max_gradient_steps=self.max_gradient_steps, - ) - for batch in generator: - next_action = self.actor.act(batch["next_actor_obs"]) - next_actions_log_prob = self.actor.get_actions_log_prob(next_action) - - next_obs_action = torch.cat((batch["next_critic_obs"], next_action), dim=-1) - qvalue_1 = self.critic_1.evaluate(next_obs_action) - qvalue_2 = self.critic_2.evaluate(next_obs_action) - qvalue_min = torch.min(qvalue_1, qvalue_2) - - targets = ( - self.gamma - * batch["dones"].logical_not() - * (qvalue_min - 
self.entropy_coef * next_actions_log_prob) - ) + batch["rewards"] - - obs_action = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) - value_loss = self.critic_1.loss_fn( - obs_action, targets - ) + self.critic_2.loss_fn(obs_action, targets) - ##### - # value_batch = self.critic.evaluate(batch["critic_obs"]) - # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) - ##### - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm_( - list(self.critic_1.parameters()) + list(self.critic_2.parameters()), - self.max_grad_norm, - ) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() - counter += 1 - - action = self.actor.act(batch["actor_obs"]) - actions_log_prob = self.actor.get_actions_log_prob(action) - obs_action = torch.cat((batch["critic_obs"], action), dim=-1) - qvalue_1 = self.critic_1.evaluate(obs_action) - qvalue_2 = self.critic_2.evaluate(obs_action) - qvalue_min = torch.min(qvalue_1, qvalue_2) - # update actor - actor_loss = (qvalue_min - self.entropy_coef * actions_log_prob).mean() - self.optimizer.zero_grad() - actor_loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) - self.optimizer.step() - self.mean_surrogate_loss += actor_loss.item() - - self.mean_value_loss /= counter - - def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) - self.mean_surrogate_loss = 0 - counter = 0 - - self.actor.act(data["actor_obs"]) - data["old_sigma_batch"] = self.actor.action_std.detach() - data["old_mu_batch"] = self.actor.action_mean.detach() - data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( - data["actions"] - ).detach() + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + def update(self, data): generator = create_uniform_generator( data, self.batch_size, max_gradient_steps=self.max_gradient_steps, ) + + count = 0 + self.mean_actor_loss = 0 + self.mean_alpha_loss = 0 + self.mean_critic_1_loss = 0 + self.mean_critic_2_loss = 0 + for batch in generator: - # ! refactor how this is done - self.actor.act(batch["actor_obs"]) - actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) - mu_batch = self.actor.action_mean - sigma_batch = self.actor.action_std - entropy_batch = self.actor.entropy - - # * KL - if self.desired_kl is not None and self.schedule == "adaptive": - with torch.inference_mode(): - kl = torch.sum( - torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) - + ( - torch.square(batch["old_sigma_batch"]) - + torch.square(batch["old_mu_batch"] - mu_batch) - ) - / (2.0 * torch.square(sigma_batch)) - - 0.5, - axis=-1, - ) - kl_mean = torch.mean(kl) - - if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) - elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) - - for param_group in self.optimizer.param_groups: - # ! 
check this - param_group["lr"] = self.learning_rate - - # * Surrogate loss - ratio = torch.exp( - actions_log_prob_batch - - torch.squeeze(batch["old_actions_log_prob_batch"]) + self.update_critic(batch) + self.update_actor_and_alpha(batch) + + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak ) - surrogate = -torch.squeeze(batch["advantages"]) * ratio - surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( - ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak ) - surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() - - loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() - - # * Gradient step - self.optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) - self.optimizer.step() - self.mean_surrogate_loss += surrogate_loss.item() - counter += 1 - self.mean_surrogate_loss /= counter + count += 1 + self.mean_actor_loss /= count + self.mean_alpha_loss /= count + self.mean_critic_1_loss /= count + self.mean_critic_2_loss /= count + + return None + + def update_critic(self, batch): + critic_obs = batch["critic_obs"] + actions = batch["actions"] + rewards = batch["rewards"] + next_actor_obs = batch["next_actor_obs"] + next_critic_obs = batch["next_critic_obs"] + dones = batch["dones"] + + with torch.no_grad(): + # * self._sample_action(actor_next_obs) + mean, std = self.actor.forward(next_actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + 1.0 - actions_normalized.pow(2) + 1e-6 + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + target_action = actions_scaled + target_action_logp = action_logp + + # * self._critic_input + # ! 
def should put the action computation into the actor + target_critic_in = torch.cat((next_critic_obs, target_action), dim=-1) + target_critic_prediction_1 = self.target_critic_1.forward(target_critic_in) + target_critic_prediction_2 = self.target_critic_2.forward(target_critic_in) + + target_next = ( + torch.min(target_critic_prediction_1, target_critic_prediction_2) + - self.alpha * target_action_logp + ) + target = rewards + self.gamma * dones.logical_not() * target_next + + critic_in = torch.cat((critic_obs, actions), dim=-1) + + # critic_prediction_1 = self.critic_1.forward(critic_in) + critic_loss_1 = self.critic_1.loss_fn(critic_in, target) + self.critic_1_optimizer.zero_grad() + critic_loss_1.backward() + nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + self.critic_1_optimizer.step() + + # critic_prediction_2 = self.critic_2.forward(critic_in) + critic_loss_2 = self.critic_2.loss_fn(critic_in, target) + self.critic_2_optimizer.zero_grad() + critic_loss_2.backward() + nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + self.critic_2_optimizer.step() + + self.mean_critic_1_loss += critic_loss_1.item() + self.mean_critic_2_loss += critic_loss_2.item() + + return + + def update_actor_and_alpha(self, batch): + actor_obs = batch["actor_obs"] + critic_obs = batch["critic_obs"] + + mean, std = self.actor.forward(actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + 1.0 - actions_normalized.pow(2) + 1e-6 + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + actor_prediction = actions_scaled + actor_prediction_logp = action_logp + + alpha_loss = ( + -self.log_alpha * (action_logp + self.target_entropy).detach().mean() + ) + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) + q_value_1 = self.critic_1.forward(critic_in) + q_value_2 = self.critic_2.forward(critic_in) + actor_loss = ( + self.alpha.detach() * actor_prediction_logp + - torch.min(q_value_1, q_value_2) + ).mean() + self.actor_optimizer.zero_grad() + actor_loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.actor_optimizer.step() + + self.mean_alpha_loss += alpha_loss.item() + self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 3caf5fec..8d4a3ab1 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -34,3 +34,4 @@ from .state_estimator import StateEstimatorNN from .actor import Actor from .critic import Critic +from .chimera_actor import ChimeraActor \ No newline at end of file diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py new file mode 100644 index 00000000..5902e3b7 --- /dev/null +++ b/learning/modules/chimera_actor.py @@ -0,0 +1,76 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal +from .utils import create_MLP +from .utils import export_network +from .utils import RunningMeanStd + + +class 
ChimeraActor(nn.Module): + def __init__( + self, + num_obs, + num_actions, + hidden_dims, + split_idx=1, + activation="tanh", + log_std_max: float = 4.0, + log_std_min: float = -20.0, + std_init: float = 1.0, + normalize_obs=True, + **kwargs, + ): + super().__init__() + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.log_std_max = log_std_max + self.log_std_min = log_std_min + self.log_std_init = torch.tensor([std_init]).log() # refactor + + self.num_obs = num_obs + self.num_actions = num_actions + + assert split_idx < len(hidden_dims) + self.latent_NN = create_MLP( + num_obs, hidden_dims[-split_idx], hidden_dims[:-split_idx], activation + ) + self.mean_NN = create_MLP( + hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + ) + self.std_NN = create_MLP( + hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + ) + + # maybe zap + self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) + Normal.set_default_validate_args = False + + def forward(self, x, deterministic=True): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + latent = self.latent_NN(x) + mean = self.mean_NN(latent) + if deterministic: + return mean + log_std = self.log_std_init + self.std_NN(latent) + return mean, log_std.clamp(self.log_std_min, self.log_std_max).exp() + + def act(self, x): + mean, std = self.forward(x, deterministic=False) + self.distribution = Normal(mean, std) + return self.distribution.sample() + + def inference_policy(self, x): + return self.forward(x, deterministic=True) + + def export(self, path): + export_network(self.inference_policy, "policy", path, self.num_obs) + + def to(self, device): + super().to(device) + self.log_std_init = self.log_std_init.to(device) + return self diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 8ef5395d..96c53438 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,11 +20,14 @@ def __init__( if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: with torch.no_grad(): - critic_observations = self.obs_rms(critic_observations) - return self.NN(critic_observations).squeeze() + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) def loss_fn(self, obs, target): - return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e70971ed..fa03e319 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -5,7 +5,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner -from learning.modules import Actor, Critic +from learning.modules import Critic, ChimeraActor from learning.storage import DictStorage from learning.algorithms import SAC @@ -27,10 +27,20 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) critic_2 = 
Critic(num_critic_obs + num_actions, **self.critic_cfg) - self.alg = SAC(actor, critic_1, critic_2, device=self.device, **self.alg_cfg) + target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) def learn(self): self.set_up_logger() @@ -49,7 +59,7 @@ def learn(self): { "actor_obs": actor_obs, "next_actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], @@ -70,7 +80,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -152,9 +162,16 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + [ + "mean_critic_1_loss", + "mean_critic_2_loss", + "mean_actor_loss", + "mean_alpha_loss", + ], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb( (self.alg.actor, self.alg.critic_1, self.alg.critic_2) @@ -163,27 +180,37 @@ def set_up_logger(self): def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) - torch.save( - { - "actor_state_dict": self.alg.actor.state_dict(), - "critic_1_state_dict": self.alg.critic_1.state_dict(), - "critic_2_state_dict": self.alg.critic_2.state_dict(), - "optimizer_state_dict": self.alg.optimizer.state_dict(), - "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), - "iter": self.it, - }, - path, - ) + save_dict = { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), + "log_alpha": self.alg.log_alpha, + "actor_optimizer_state_dict": self.alg.actor_optimizer.state_dict(), + "critic_1_optimizer_state_dict": self.alg.critic_1_optimizer.state_dict(), + "critic_2_optimizer_state_dict": self.alg.critic_2_optimizer.state_dict(), + "log_alpha_optimizer_state_dict": self.alg.log_alpha_optimizer.state_dict(), + "iter": self.it, + } + torch.save(save_dict, path) def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) + self.log_alpha = loaded_dict["log_alpha"] if load_optimizer: - self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) - self.alg.critic_optimizer.load_state_dict( - loaded_dict["critic_optimizer_state_dict"] + self.alg.actor_optimizer.load_state_dict( + loaded_dict["actor_optimizer_state_dict"] + ) + self.alg.critic_1_optimizer.load_state_dict( + loaded_dict["critic_1_optimizer_state_dict"] + ) + self.alg.critic_2_optimizer.load_state_dict( + loaded_dict["critic_2_optimizer_state_dict"] + ) + self.alg.log_alpha_optimizer.load_state_dict( + 
loaded_dict["log_alpha_optimizer_state_dict"] ) self.it = loaded_dict["iter"] @@ -194,7 +221,7 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - return self.alg.actor.act_inference(obs) + return self.alg.act(obs) # todo inference mode def export(self, path): self.alg.actor.export(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 6a4a4265..844f58a5 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): { "actor_obs": actor_obs, "next_actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], @@ -59,7 +59,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 15c674d6..b39dc16d 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,7 +1,7 @@ - from .utils import ( remove_zero_weighted_rewards, set_discount_from_horizon, + polyak_update ) from .dict_utils import * from .logger import Logger diff --git a/learning/utils/utils.py b/learning/utils/utils.py index 7a54c65a..d4eac980 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -17,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target From c542214c5cb863acdfe111a46b156ec00f64ea2f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 18:27:13 -0400 Subject: [PATCH 11/43] update fixed_robot to match legged_robot refactor --- gym/envs/base/fixed_robot.py | 92 ++++++++++++---------------- gym/envs/pendulum/pendulum.py | 4 -- gym/envs/pendulum/pendulum_config.py | 4 +- 3 files changed, 40 insertions(+), 60 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 1fe4c5d5..c3cf0566 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - 
self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(self.torques) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. 
- - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 95af1fb2..77b05f10 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -4,10 +4,6 @@ class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 5e2e0e01..befc134d 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OffPolicyRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -122,5 +122,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "SAC" + algorithm_class_name = "PPO2" num_steps_per_env = 32 From a5ca4fd0f7030debff27d8440725b868a921232b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 18:58:49 -0400 Subject: [PATCH 12/43] WIP: separate config for pendulum SAC, comments on missing components --- gym/envs/__init__.py | 4 +- gym/envs/pendulum/pendulum_SAC_config.py | 72 ++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 gym/envs/pendulum/pendulum_SAC_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 57cbd8c0..824ac8ce 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -44,6 +44,7 @@ "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", + "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", } task_dict = { @@ -66,7 +67,8 @@ "HumanoidRunningRunnerCfg", ], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], - "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumCfg", "PendulumSACRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py new file mode 100644 index 00000000..840346ee --- /dev/null +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -0,0 +1,72 @@ +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO + + +class PendulumSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor: + hidden_dims = [128, 64, 32, 32] + split_idx = 2 + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + obs = [ + "dof_pos", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.05 + dof_vel = 0.0 + torques = 
0.01 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + batch_size = 2**17 + max_gradient_steps = 10 + action_max = 5.0 + action_min = -5.0 + actor_noise_std = 1.0 + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -1.0 + max_grad_norm = 1.0 + polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.98 + alpha_lr = 1e-4 + actor_lr = 1e-5 + critic_lr = 1e-5 + # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 + # warm-up steps + # auto entropy coefficient (alpha) + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 500 # number of policy updates + algorithm_class_name = "SAC" + num_steps_per_env = 32 From cf5cecdb3b7ac2c192e959d7e3f832040741aec8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 1 Apr 2024 10:01:32 -0400 Subject: [PATCH 13/43] WIP: partly fill replay buffer before training, bugfix on generator, small fixes --- gym/envs/pendulum/pendulum_SAC_config.py | 7 +++- gym/envs/pendulum/pendulum_config.py | 2 +- learning/runners/off_policy_runner.py | 50 +++++++++++++++++++++++- learning/storage/__init__.py | 3 +- learning/storage/replay_buffer.py | 48 +++++++++++++++++++++++ learning/utils/dict_utils.py | 3 ++ scripts/train.py | 2 - 7 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 learning/storage/replay_buffer.py diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 840346ee..a6fe30d7 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -45,7 +45,9 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - batch_size = 2**17 + initial_fill = 1000 + storage_size = 10**17 + batch_size = 256 max_gradient_steps = 10 action_max = 5.0 action_min = -5.0 @@ -69,4 +71,5 @@ class runner(FixedRobotCfgPPO.runner): experiment_name = "pendulum" max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" - num_steps_per_env = 32 + save_interval = 10 + num_steps_per_env = 100 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index befc134d..253e4165 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,7 +33,7 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 + ctrl_frequency = 20 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index fa03e319..380b12cd 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -6,11 +6,11 @@ from .BaseRunner import BaseRunner from learning.modules import Critic, ChimeraActor -from learning.storage import DictStorage +from learning.storage import ReplayBuffer from learning.algorithms import SAC logger = Logger() -storage = DictStorage() +storage = ReplayBuffer() class OffPolicyRunner(BaseRunner): @@ -73,6 +73,52 @@ def learn(self): device=self.device, ) + # fill buffer + for _ in range(self.alg_cfg["initial_fill"]): + with torch.inference_mode(): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + 
"critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + # print every 10% of initial fill + if _ % (self.alg_cfg["initial_fill"] // 10) == 0: + print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index f10df85f..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,4 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage -from .dict_storage import DictStorage \ No newline at end of file +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..889cc69b --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,48 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + self.fill_count = 0 + self.data[self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 3b102338..7e5c6c79 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -60,10 +60,13 @@ def create_uniform_generator( total_data = n * m if batch_size > total_data: + Warning("Batch size is larger than total data, using available data only.") batch_size = total_data num_batches_per_epoch = total_data // batch_size if max_gradient_steps: + if max_gradient_steps < num_batches_per_epoch: + num_batches_per_epoch = max_gradient_steps num_epochs = max_gradient_steps // num_batches_per_epoch num_epochs = max(num_epochs, 1) diff --git a/scripts/train.py b/scripts/train.py index 283717e8..759992cf 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,8 +12,6 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, 
args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) - randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) From 82e2179b8e71604550836cdb757cdb8f2afce9db Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 1 Apr 2024 18:14:45 -0400 Subject: [PATCH 14/43] switch back to elu, remove grad clipping --- gym/envs/pendulum/pendulum_SAC_config.py | 20 ++++++++++---------- gym/envs/pendulum/pendulum_config.py | 6 +++--- learning/algorithms/sac.py | 22 ++++++++++++++-------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index a6fe30d7..e8726faa 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,10 +6,10 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor: - hidden_dims = [128, 64, 32, 32] + hidden_dims = [128, 64, 32] split_idx = 2 # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" obs = [ "dof_pos", @@ -28,16 +28,16 @@ class critic: "dof_pos", "dof_vel", ] - hidden_dims = [128, 64, 32] + hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" class reward: class weights: theta = 0.0 omega = 0.0 - equilibrium = 1.0 - energy = 0.05 + equilibrium = 10.0 + energy = 0.5 dof_vel = 0.0 torques = 0.01 @@ -48,7 +48,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): initial_fill = 1000 storage_size = 10**17 batch_size = 256 - max_gradient_steps = 10 + max_gradient_steps = 1 # 10 # SB3: 1 action_max = 5.0 action_min = -5.0 actor_noise_std = 1.0 @@ -59,9 +59,9 @@ class algorithm(FixedRobotCfgPPO.algorithm): max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 - alpha_lr = 1e-4 - actor_lr = 1e-5 - critic_lr = 1e-5 + alpha_lr = 3e-4 + actor_lr = 3e-4 + critic_lr = 3e-4 # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 # warm-up steps # auto entropy coefficient (alpha) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 253e4165..7944e5b2 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,7 +33,7 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 20 + ctrl_frequency = 100 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -43,7 +43,7 @@ class asset(FixedRobotCfg.asset): file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" disable_gravity = False disable_motors = False # all torques set to 0 - joint_damping = 0.1 + joint_damping = 0.5 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 1.0 + tau_ff = 5.0 class PendulumRunnerCfg(FixedRobotCfgPPO): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 52be75ed..3c418474 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -1,5 +1,6 @@ import torch -import torch.nn as nn + +# import torch.nn as nn import torch.optim as optim from learning.utils import create_uniform_generator, polyak_update @@ -174,8 +175,12 @@ def update_critic(self, 
batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - 1.0 - actions_normalized.pow(2) + 1e-6 + # action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + # 1.0 - actions_normalized.pow(2) + 1e-6 + # ).sum(-1) + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) # * returns target_action = actions_scaled, target_action_logp = action_logp @@ -200,14 +205,14 @@ def update_critic(self, batch): critic_loss_1 = self.critic_1.loss_fn(critic_in, target) self.critic_1_optimizer.zero_grad() critic_loss_1.backward() - nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) self.critic_1_optimizer.step() # critic_prediction_2 = self.critic_2.forward(critic_in) critic_loss_2 = self.critic_2.loss_fn(critic_in, target) self.critic_2_optimizer.zero_grad() critic_loss_2.backward() - nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) self.critic_2_optimizer.step() self.mean_critic_1_loss += critic_loss_1.item() @@ -239,8 +244,9 @@ def update_actor_and_alpha(self, batch): actor_prediction_logp = action_logp alpha_loss = ( - -self.log_alpha * (action_logp + self.target_entropy).detach().mean() - ) + -self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() self.log_alpha_optimizer.zero_grad() alpha_loss.backward() @@ -255,7 +261,7 @@ def update_actor_and_alpha(self, batch): ).mean() self.actor_optimizer.zero_grad() actor_loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() self.mean_alpha_loss += alpha_loss.item() From 7da0fab0a336b15f0d87d81e6a60e16f401238dd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 4 Apr 2024 09:11:22 -0400 Subject: [PATCH 15/43] match URDF, obs, ctrl freq, etc to gymansium --- gym/envs/pendulum/pendulum.py | 14 ++++++++++++++ gym/envs/pendulum/pendulum_SAC_config.py | 2 +- gym/envs/pendulum/pendulum_config.py | 14 +++++++------- resources/robots/pendulum/urdf/pendulum.urdf | 4 ++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 77b05f10..b9004c86 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -4,6 +4,20 @@ class Pendulum(FixedRobot): + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e8726faa..d03bdc52 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ 
-12,7 +12,7 @@ class actor: activation = "elu" obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 7944e5b2..5f21cf9a 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -27,14 +27,14 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 4, torch.pi / 4], + "theta": [-torch.pi, torch.pi], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 20 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 5.0 + tau_ff = 2.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -62,7 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" obs = [ "dof_pos", @@ -83,14 +83,14 @@ class critic: ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.05 + energy = 0.0 dof_vel = 0.0 torques = 0.01 diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf index 90e894e4..d94bffea 100644 --- a/resources/robots/pendulum/urdf/pendulum.urdf +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -47,8 +47,8 @@ - - + + From 1feafdecf7d941f86a872c89efa2c9b51bdea717 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 4 Apr 2024 09:59:32 -0400 Subject: [PATCH 16/43] uniform sampling during initial fill, proper inference actions for play --- gym/envs/pendulum/pendulum_SAC_config.py | 4 ++-- gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/sac.py | 4 ++-- learning/runners/off_policy_runner.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index d03bdc52..2ba4c611 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -49,8 +49,8 @@ class algorithm(FixedRobotCfgPPO.algorithm): storage_size = 10**17 batch_size = 256 max_gradient_steps = 1 # 10 # SB3: 1 - action_max = 5.0 - action_min = -5.0 + action_max = 1.0 + action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 5f21cf9a..ea0b1fcc 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_basic" # * initial conditions for reset_to_range dof_pos_range = { diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 3c418474..24f09587 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -16,8 +16,8 @@ def __init__( target_critic_2, batch_size=2**15, max_gradient_steps=10, - action_max=10.0, - action_min=-10.0, + action_max=1.0, + action_min=-1.0, actor_noise_std=1.0, log_std_max=4.0, log_std_min=-20.0, diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 380b12cd..926683ac 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -50,6 +50,7 @@ def learn(self): self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) + actions = self.alg.act(actor_obs) tot_iter = self.it + self.num_learning_iterations self.save() @@ -76,7 +77,7 @@ def learn(self): # fill buffer for _ in range(self.alg_cfg["initial_fill"]): with torch.inference_mode(): - actions = self.alg.act(actor_obs) + actions = torch.rand_like(actions) * 2 - 1 self.set_actions( self.actor_cfg["actions"], actions, @@ -267,7 +268,12 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - return self.alg.act(obs) # todo inference mode + mean = self.alg.actor.forward(obs) # todo inference mode + actions = torch.tanh(mean) + actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( + self.alg.action_min, self.alg.action_max + ) + return actions def export(self, path): self.alg.actor.export(path) From 1969e0deab0aa458bd7c9ee97ee05c4fa2d5abb7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 5 Apr 2024 09:16:47 -0400 Subject: [PATCH 17/43] WIP: fix polyak bug, some more tuning --- gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/pendulum/pendulum_SAC_config.py | 17 ++++++----- gym/envs/pendulum/pendulum_config.py | 32 ++++++++++---------- learning/algorithms/sac.py | 30 +++++++++--------- learning/runners/off_policy_runner.py | 1 + resources/robots/pendulum/urdf/pendulum.urdf | 4 +-- 7 files changed, 45 insertions(+), 43 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index fbecced6..4ced4366 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -170,7 +170,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new batch_size = 2**15 # new diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 43b09e68..4ce1bc8f 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -293,7 +293,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new batch_size = 2**15 # new diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 2ba4c611..8071d7e4 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -11,11 +11,11 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + normalize_obs = 
False obs = [ "dof_pos_obs", "dof_vel", ] - actions = ["tau_ff"] disable_actions = False @@ -25,30 +25,31 @@ class noise: class critic: obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + normalize_obs = False class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 10.0 - energy = 0.5 + energy = 1.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.001 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 1000 - storage_size = 10**17 + initial_fill = 10**3 + storage_size = 10**6 # 17 batch_size = 256 - max_gradient_steps = 1 # 10 # SB3: 1 + max_gradient_steps = 100 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 @@ -72,4 +73,4 @@ class runner(FixedRobotCfgPPO.runner): max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" save_interval = 10 - num_steps_per_env = 100 + num_steps_per_env = 64 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index ea0b1fcc..b274192c 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,18 +23,18 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi, torch.pi], + "theta": [-torch.pi / 4, torch.pi / 4], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 20 - desired_sim_frequency = 100 + ctrl_frequency = 100 + desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -43,7 +43,7 @@ class asset(FixedRobotCfg.asset): file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" disable_gravity = False disable_motors = False # all torques set to 0 - joint_damping = 0.5 + joint_damping = 0.1 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 2.0 + tau_ff = 1.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -62,7 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" obs = [ "dof_pos", @@ -83,14 +83,14 @@ class critic: ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.05 dof_vel = 0.0 torques = 0.01 @@ -100,22 +100,22 @@ class termination_weight: class algorithm(FixedRobotCfgPPO.algorithm): # both gamma = 0.99 - lam = 0.95 + discount_horizon = 2.0 + lam = 0.98 # shared - batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new - batch_size = 2**15 # new + batch_size = 2**16 # new clip_param = 0.2 - learning_rate = 1.0e-3 + learning_rate = 1.0e-4 max_grad_norm = 1.0 # Critic use_clipped_value_loss = True # Actor - entropy_coef = 0.1 - schedule = "adaptive" # could be adaptive, fixed + entropy_coef = 0.01 + schedule = "fixed" # could be 
adaptive, fixed desired_kl = 0.01 class runner(FixedRobotCfgPPO.runner): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 24f09587..7305d786 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -139,14 +139,14 @@ def update(self, data): self.update_critic(batch) self.update_actor_and_alpha(batch) - # Update Target Networks - self.target_critic_1 = polyak_update( - self.critic_1, self.target_critic_1, self.polyak - ) - self.target_critic_1 = polyak_update( - self.critic_1, self.target_critic_1, self.polyak - ) count += 1 + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak + ) + self.target_critic_2 = polyak_update( + self.critic_2, self.target_critic_2, self.polyak + ) self.mean_actor_loss /= count self.mean_alpha_loss /= count self.mean_critic_1_loss /= count @@ -243,14 +243,14 @@ def update_actor_and_alpha(self, batch): actor_prediction = actions_scaled actor_prediction_logp = action_logp - alpha_loss = ( - -self.log_alpha * (action_logp + self.target_entropy).detach() - ).mean() - # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # alpha_loss = ( + # -self.log_alpha * (action_logp + self.target_entropy).detach() + # ).mean() + # # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() + # self.log_alpha_optimizer.zero_grad() + # alpha_loss.backward() + # self.log_alpha_optimizer.step() critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) q_value_1 = self.critic_1.forward(critic_in) @@ -264,5 +264,5 @@ def update_actor_and_alpha(self, batch): # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() - self.mean_alpha_loss += alpha_loss.item() + # self.mean_alpha_loss += alpha_loss.item() self.mean_actor_loss += actor_loss.item() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 926683ac..31556725 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -216,6 +216,7 @@ def set_up_logger(self): "mean_critic_2_loss", "mean_actor_loss", "mean_alpha_loss", + "alpha", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf index d94bffea..90e894e4 100644 --- a/resources/robots/pendulum/urdf/pendulum.urdf +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -47,8 +47,8 @@ - - + + From b69f8c077355c3c147dadd45c4ff946211ebff52 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 8 Apr 2024 14:51:17 -0400 Subject: [PATCH 18/43] normalize advantages only after computing returns --- gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/ppo2.py | 3 ++- learning/utils/dict_utils.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index b274192c..b25e0455 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -103,7 +103,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): discount_horizon = 2.0 lam = 0.98 # shared - max_gradient_steps = 10 + max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**16 # new diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 3e5f454d..09c60ab9 100644 --- 
a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -5,6 +5,7 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, ) @@ -62,8 +63,8 @@ def update(self, data): data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] - self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) self.update_actor(data) def update_critic(self, data): diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 7e5c6c79..99534c22 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -48,7 +48,7 @@ def compute_generalized_advantages(data, gamma, lam, critic): ) advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - return normalize(advantages) + return advantages # todo change num_epochs to num_batches From 7db3f141a0f27938d5f9cf624adbecbda80c13c8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 11 Apr 2024 12:44:14 -0400 Subject: [PATCH 19/43] WIP: some tuning and fixes --- gym/envs/pendulum/pendulum_config.py | 6 +++--- learning/algorithms/sac.py | 21 ++++++++++++++------- learning/modules/chimera_actor.py | 2 ++ learning/runners/off_policy_runner.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index b25e0455..d13931d9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 + num_envs = 2**8 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_basic" # * initial conditions for reset_to_range dof_pos_range = { @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 1.0 + tau_ff = 10.0 class PendulumRunnerCfg(FixedRobotCfgPPO): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 7305d786..c079c1f6 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -84,7 +84,6 @@ def __init__( # self.max_grad_norm = max_grad_norm # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters - self.learning_starts = 100 self.batch_size = batch_size self.polyak = polyak self.gamma = gamma @@ -195,8 +194,9 @@ def update_critic(self, batch): target_next = ( torch.min(target_critic_prediction_1, target_critic_prediction_2) - - self.alpha * target_action_logp + - self.alpha.detach() * target_action_logp ) + # the detach inside torch.no_grad() should be redundant target = rewards + self.gamma * dones.logical_not() * target_next critic_in = torch.cat((critic_obs, actions), dim=-1) @@ -243,14 +243,21 @@ def update_actor_and_alpha(self, batch): actor_prediction = actions_scaled actor_prediction_logp = action_logp + # entropy loss + alpha_loss = -( + self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + # alpha_loss = ( # -self.log_alpha * (action_logp + self.target_entropy).detach() # ).mean() - # # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # alpha_loss = ( + # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # ) - # self.log_alpha_optimizer.zero_grad() - # alpha_loss.backward() - # self.log_alpha_optimizer.step() + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) q_value_1 = self.critic_1.forward(critic_in) @@ -264,5 +271,5 @@ def update_actor_and_alpha(self, batch): # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() - # self.mean_alpha_loss += alpha_loss.item() + self.mean_alpha_loss += alpha_loss.item() self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index 5902e3b7..76cb1b1e 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -44,6 +44,8 @@ def __init__( hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation ) + # nn.init.orthogonal_() + # maybe zap self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) Normal.set_default_validate_args = False diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 31556725..cd93b39e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -269,7 +269,7 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - mean = self.alg.actor.forward(obs) # todo inference mode + mean = self.alg.actor.forward(obs) actions = torch.tanh(mean) actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( self.alg.action_min, self.alg.action_max From fe0a489e9e8ebfd1627fcdb2cb709dc8055f3017 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 11 Apr 2024 18:13:30 -0400 Subject: [PATCH 20/43] 
WIP: some more reasonable tuning for single env --- gym/envs/pendulum/pendulum_SAC_config.py | 19 ++++++++----------- gym/envs/pendulum/pendulum_config.py | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 8071d7e4..186e78c6 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -46,31 +46,28 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 10**3 + initial_fill = 256 storage_size = 10**6 # 17 batch_size = 256 - max_gradient_steps = 100 # 10 # SB3: 1 + max_gradient_steps = 10 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 alpha = 0.2 - target_entropy = -1.0 + target_entropy = 1.0 # -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 - alpha_lr = 3e-4 - actor_lr = 3e-4 - critic_lr = 3e-4 - # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 - # warm-up steps - # auto entropy coefficient (alpha) + alpha_lr = 3e-5 + actor_lr = 3e-5 + critic_lr = 3e-5 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" - save_interval = 10 - num_steps_per_env = 64 + save_interval = 50 + num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index d13931d9..9aadb7d5 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**8 + num_envs = 1 # 2**8 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 From 38087410e05709436e2526c1edfdcd6018afa90a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 12 Apr 2024 13:14:33 -0400 Subject: [PATCH 21/43] refactor chimera actor hidden layers --- gym/envs/pendulum/pendulum_SAC_config.py | 15 +++++++++++---- gym/envs/pendulum/pendulum_config.py | 2 +- learning/modules/chimera_actor.py | 17 ++++++++++++----- learning/runners/off_policy_runner.py | 3 +++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 186e78c6..e473e668 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,10 +6,17 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor: - hidden_dims = [128, 64, 32] - split_idx = 2 + hidden_dims = { + "latent": [128, 128], + "mean": [64, 32], + "std": [64, 32], + } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = { + "latent": "elu", + "mean": "elu", + "std": "tanh", + } normalize_obs = False obs = [ @@ -46,7 +53,7 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 256 + initial_fill = 2 storage_size = 10**6 # 17 batch_size = 256 max_gradient_steps = 10 # 10 # SB3: 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 9aadb7d5..d13931d9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 1 # 2**8 + num_envs = 2**8 num_actuators = 1 # 1 for 
theta connecting base and pole episode_length_s = 5.0 diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index 76cb1b1e..f2cad4de 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -12,7 +12,6 @@ def __init__( num_obs, num_actions, hidden_dims, - split_idx=1, activation="tanh", log_std_max: float = 4.0, log_std_min: float = -20.0, @@ -33,15 +32,23 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - assert split_idx < len(hidden_dims) self.latent_NN = create_MLP( - num_obs, hidden_dims[-split_idx], hidden_dims[:-split_idx], activation + num_obs, + hidden_dims["latent"][-1], + hidden_dims["latent"][:-1], + activation["latent"], ) self.mean_NN = create_MLP( - hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + hidden_dims["latent"][-1], + num_actions, + hidden_dims["mean"], + activation["mean"], ) self.std_NN = create_MLP( - hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + hidden_dims["latent"][-1], + num_actions, + hidden_dims["std"], + activation["std"], ) # nn.init.orthogonal_() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index cd93b39e..b8e5e652 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -32,6 +32,9 @@ def _set_up_alg(self): critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + self.alg = SAC( actor, critic_1, From 5636bfcc370a0dca4bed7ba8f81c02cd9df26510 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 12 Apr 2024 13:21:56 -0400 Subject: [PATCH 22/43] quickfix of init_fill prinout --- learning/runners/off_policy_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index b8e5e652..b9269234 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -120,7 +120,9 @@ def learn(self): ) storage.add_transitions(transition) # print every 10% of initial fill - if _ % (self.alg_cfg["initial_fill"] // 10) == 0: + if (self.alg_cfg["initial_fill"] > 10) and ( + _ % (self.alg_cfg["initial_fill"] // 10) == 0 + ): print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") logger.tic("runtime") From cb5175283b2603aefc6429083ecc155c4766b549 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 07:47:05 -0400 Subject: [PATCH 23/43] fix replay buffer - data was not being added correctly - runner was indexing along num_envs instead of time - use data getter to handle overfill --- learning/runners/off_policy_runner.py | 5 ++--- learning/storage/replay_buffer.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index b9269234..e57be9a9 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -73,7 +73,7 @@ def learn(self): storage.initialize( transition, self.env.num_envs, - self.env.num_envs * self.num_steps_per_env, + self.alg_cfg["storage_size"], device=self.device, ) @@ -179,8 +179,7 @@ def learn(self): logger.toc("collection") logger.tic("learning") - self.alg.update(storage.data) - storage.clear() + self.alg.update(storage.get_data()) logger.toc("learning") logger.log_all_categories() 
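Note before the replay-buffer diff that follows: the behaviour this fix is after is a ring buffer that writes transitions along the time dimension (wrapping when full) and reads back only the rows that actually hold data. A minimal sketch of that bookkeeping, using assumed names and a plain tensor in place of the repository's TensorDict storage:

import torch

class RingBufferSketch:
    def __init__(self, max_length, num_envs, obs_dim, device="cpu"):
        self.max_length = max_length
        self.data = torch.zeros(max_length, num_envs, obs_dim, device=device)
        self.fill_count = 0  # total transitions ever written
        self.add_index = 0   # next time slot to overwrite

    def add(self, transition):
        # transition has shape (num_envs, obs_dim); index along time, not num_envs
        self.data[self.add_index] = transition
        self.add_index = (self.add_index + 1) % self.max_length
        self.fill_count += 1

    def get_data(self):
        # return only rows holding real data; once the buffer has wrapped,
        # that is the whole buffer
        return self.data[: min(self.fill_count, self.max_length)]

Downstream, SAC's update() then draws uniform mini-batches from whatever get_data() returns.
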
diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py index 889cc69b..de252f8a 100644 --- a/learning/storage/replay_buffer.py +++ b/learning/storage/replay_buffer.py @@ -19,6 +19,7 @@ def initialize( self.max_length = max_length self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) self.fill_count = 0 + self.add_index = 0 for key in dummy_dict.keys(): if dummy_dict[key].dim() == 1: # if scalar @@ -36,13 +37,18 @@ def initialize( @torch.inference_mode def add_transitions(self, transition: TensorDict): - if self.fill_count >= self.max_length: - self.fill_count = 0 - self.data[self.fill_count] = transition + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition self.fill_count += 1 + self.add_index += 1 + + def get_data(self): + return self.data[: max(self.fill_count, self.max_length), :] def clear(self): self.fill_count = 0 + self.add_index = 0 with torch.inference_mode(): for tensor in self.data: tensor.zero_() From eda4827727f7ad8ebfda6fb4f4f454dee4073a8c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 08:49:17 -0400 Subject: [PATCH 24/43] update configs to be closer to SB3 --- gym/envs/pendulum/pendulum.py | 3 ++- gym/envs/pendulum/pendulum_SAC_config.py | 16 ++++++++-------- gym/envs/pendulum/pendulum_config.py | 12 ++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index b9004c86..c7d9be99 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -50,4 +50,5 @@ def _reward_energy(self): ) desired_energy = m_pendulum * 9.81 * l_pendulum energy_error = kinetic_energy + potential_energy - desired_energy - return self._sqrdexp(energy_error / desired_energy) + return -(energy_error / desired_energy).pow(2) + # return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e473e668..39fdf7da 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -8,14 +8,14 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { "latent": [128, 128], - "mean": [64, 32], - "std": [64, 32], + "mean": [32], + "std": [32], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { "latent": "elu", "mean": "elu", - "std": "tanh", + "std": "elu", } normalize_obs = False @@ -35,7 +35,7 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [256, 256] + hidden_dims = [64, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = False @@ -53,16 +53,16 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 2 + initial_fill = 100 storage_size = 10**6 # 17 - batch_size = 256 - max_gradient_steps = 10 # 10 # SB3: 1 + batch_size = 256 # 4096 + max_gradient_steps = 1 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 - alpha = 0.2 + alpha = 0.8 target_entropy = 1.0 # -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index d13931d9..2c42aedb 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class 
env(FixedRobotCfg.env): - num_envs = 2**8 + num_envs = 1 # 4096 num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 5.0 + episode_length_s = 25.0 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -27,13 +27,13 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 4, torch.pi / 4], + "theta": [-torch.pi / 2, torch.pi / 2], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 + ctrl_frequency = 25 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 10.0 + tau_ff = 5.0 class PendulumRunnerCfg(FixedRobotCfgPPO): From 67f75cc7702055e397c91c539dfcb670c2aa5f10 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 09:02:31 -0400 Subject: [PATCH 25/43] derp, min instead of max -_- derp --- gym/envs/pendulum/pendulum_SAC_config.py | 2 +- learning/storage/replay_buffer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 39fdf7da..ab16bf24 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -63,7 +63,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): log_std_max = 4.0 log_std_min = -20.0 alpha = 0.8 - target_entropy = 1.0 # -1.0 + target_entropy = -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py index de252f8a..350a2c25 100644 --- a/learning/storage/replay_buffer.py +++ b/learning/storage/replay_buffer.py @@ -44,7 +44,7 @@ def add_transitions(self, transition: TensorDict): self.add_index += 1 def get_data(self): - return self.data[: max(self.fill_count, self.max_length), :] + return self.data[: min(self.fill_count, self.max_length), :] def clear(self): self.fill_count = 0 From 4eb0966b1adef3ebc75220a96d1d816af430cc04 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 12:04:41 -0400 Subject: [PATCH 26/43] split environment config for SAC pendulum --- gym/envs/__init__.py | 5 ++-- gym/envs/pendulum/pendulum_SAC_config.py | 31 ++++++++++++++++++++++++ gym/envs/pendulum/pendulum_config.py | 6 ++--- scripts/play.py | 6 ++--- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 824ac8ce..f57f4247 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,7 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "Pendulum": ".pendulum.pendulum", + "Pendulum": ".pendulum.pendulum" } config_dict = { @@ -32,6 +32,7 @@ "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", + "PendulumSACCfg": ".pendulum.pendulum_SAC_config", } runner_config_dict = { @@ -68,7 +69,7 @@ ], "flat_anymal_c": 
["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], - "sac_pendulum": ["Pendulum", "PendulumCfg", "PendulumSACRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index ab16bf24..e9f73c57 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -1,4 +1,35 @@ +import torch from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.pendulum.pendulum_config import PendulumCfg + + +class PendulumSACCfg(PendulumCfg): + class env(PendulumCfg.env): + num_envs = 1 + episode_length_s = 5.0 + + class init_state(PendulumCfg.init_state): + reset_mode = "reset_to_basic" + default_joint_angles = {"theta": 0.0} + dof_pos_range = { + "theta": [-torch.pi / 2, torch.pi / 2], + } + dof_vel_range = {"theta": [-1, 1]} + + class control(PendulumCfg.control): + ctrl_frequency = 25 + desired_sim_frequency = 200 + + class asset(PendulumCfg.asset): + joint_damping = 0.1 + + class reward_settings(PendulumCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(PendulumCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + tau_ff = 5.0 class PendulumSACRunnerCfg(FixedRobotCfgPPO): diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 2c42aedb..49534568 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,8 +5,8 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 1 # 4096 - num_actuators = 1 # 1 for theta connecting base and pole + num_envs = 4096 + num_actuators = 1 episode_length_s = 25.0 class terrain(FixedRobotCfg.terrain): @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" # * initial conditions for reset_to_range dof_pos_range = { diff --git a/scripts/play.py b/scripts/play.py index 4486daae..d832bf69 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -9,16 +9,16 @@ def setup(args): env_cfg, train_cfg = task_registry.create_cfgs(args) - env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + env_cfg.env.num_envs = 32 if hasattr(env_cfg, "push_robots"): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 + env_cfg.env.episode_length_s = 5 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() + env_cfg.init_state.reset_mode = "reset_to_range" env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) From 5ee76742c6c6e78cb33003cb1cf5f191641965aa Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 22 Apr 2024 11:55:38 -0400 Subject: [PATCH 27/43] WIP: random tweaks and tuning before switching to other stuff --- gym/envs/base/task_skeleton.py | 2 +- gym/envs/pendulum/pendulum.py | 4 --- gym/envs/pendulum/pendulum_SAC_config.py | 26 +++++++++--------- learning/algorithms/sac.py | 24 ++++++++++++++--- learning/modules/chimera_actor.py | 34 +++++++++++------------- learning/runners/off_policy_runner.py | 5 ++++ scripts/play.py | 2 +- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..6a1b352c 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -67,7 +67,7 @@ def _eval_reward(self, name): def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length self.to_be_reset = self.timed_out | self.terminated diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c7d9be99..c22c034a 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -35,10 +35,6 @@ def _reward_equilibrium(self): # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) - def _reward_torques(self): - """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) - def _reward_energy(self): m_pendulum = 1.0 l_pendulum = 1.0 diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e9f73c57..aa182e15 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,8 +5,8 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 1 - episode_length_s = 5.0 + num_envs = 256 + episode_length_s = 2.5 class init_state(PendulumCfg.init_state): reset_mode = "reset_to_basic" @@ -29,7 +29,7 @@ class reward_settings(PendulumCfg.reward_settings): class scaling(PendulumCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi - tau_ff = 5.0 + tau_ff = 1.0 class PendulumSACRunnerCfg(FixedRobotCfgPPO): @@ -38,9 +38,9 @@ 
class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { - "latent": [128, 128], - "mean": [32], - "std": [32], + "latent": [400], + "mean": [300], + "std": [300], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { @@ -66,25 +66,25 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [64, 64] + hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = False + normalize_obs = True class reward: class weights: theta = 0.0 omega = 0.0 - equilibrium = 10.0 + equilibrium = 2.0 energy = 1.0 dof_vel = 0.0 - torques = 0.001 + torques = 0.01 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 100 + initial_fill = 10**3 storage_size = 10**6 # 17 batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 @@ -105,7 +105,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates + max_iterations = 10000 # number of policy updates algorithm_class_name = "SAC" - save_interval = 50 + save_interval = 250 num_steps_per_env = 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index c079c1f6..f7bd7510 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -90,6 +90,13 @@ def __init__( # self.ent_coef = "fixed" # self.target_entropy = "fixed" + self.test_input = torch.randn(256, 3, device=self.device) + self.test_actions = torch.zeros(256, 1, device=self.device) + self.test_action_mean = torch.zeros(256, device=self.device) + self.test_action_std = torch.zeros(256, device=self.device) + self.test_action_max = torch.zeros(256, device=self.device) + self.test_action_min = torch.zeros(256, device=self.device) + @property def alpha(self): return self.log_alpha.exp() @@ -112,8 +119,6 @@ def act(self, obs): mean, std = self.actor.forward(obs, deterministic=False) distribution = torch.distributions.Normal(mean, std) actions = distribution.rsample() - - ## * self._scale_actions(actions, intermediate=True) actions_normalized = torch.tanh(actions) # RSL also does a resahpe(-1, self.action_size), not sure why actions_scaled = ( @@ -121,6 +126,14 @@ def act(self, obs): ).clamp(self.action_min, self.action_max) return actions_scaled + def act_inference(self, obs): + mean = self.actor.forward(obs, deterministic=True) + actions_normalized = torch.tanh(mean) + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + def update(self, data): generator = create_uniform_generator( data, @@ -150,7 +163,12 @@ def update(self, data): self.mean_alpha_loss /= count self.mean_critic_1_loss /= count self.mean_critic_2_loss /= count - + with torch.inference_mode(): + self.test_actions = self.act_inference(self.test_input).cpu() + self.test_action_mean = self.test_actions.mean().item() + self.test_action_std = self.test_actions.std().item() + self.test_action_max = self.test_actions.max().item() + self.test_action_min = self.test_actions.min().item() return None def update_critic(self, batch): diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index f2cad4de..1addc83d 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -12,10 +12,10 @@ def __init__( num_obs, num_actions, hidden_dims, - activation="tanh", - log_std_max: float = 4.0, - log_std_min: float = 
-20.0, - std_init: float = 1.0, + activation, + std_init=1.0, + log_std_max=4.0, + log_std_min=-20.0, normalize_obs=True, **kwargs, ): @@ -33,26 +33,24 @@ def __init__( self.num_actions = num_actions self.latent_NN = create_MLP( - num_obs, - hidden_dims["latent"][-1], - hidden_dims["latent"][:-1], - activation["latent"], + num_inputs=num_obs, + num_outputs=hidden_dims["latent"][-1], + hidden_dims=hidden_dims["latent"][:-1], + activation=activation["latent"], ) self.mean_NN = create_MLP( - hidden_dims["latent"][-1], - num_actions, - hidden_dims["mean"], - activation["mean"], + num_inputs=hidden_dims["latent"][-1], + num_outputs=num_actions, + hidden_dims=hidden_dims["mean"], + activation=activation["mean"], ) self.std_NN = create_MLP( - hidden_dims["latent"][-1], - num_actions, - hidden_dims["std"], - activation["std"], + num_inputs=hidden_dims["latent"][-1], + num_outputs=num_actions, + hidden_dims=hidden_dims["std"], + activation=activation["std"], ) - # nn.init.orthogonal_() - # maybe zap self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) Normal.set_default_validate_args = False diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e57be9a9..2a7bc66e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -221,6 +221,11 @@ def set_up_logger(self): "mean_actor_loss", "mean_alpha_loss", "alpha", + # "test_actions", + "test_action_mean", + "test_action_std", + "test_action_max", + "test_action_min", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) diff --git a/scripts/play.py b/scripts/play.py index d832bf69..37882f7e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -14,7 +14,7 @@ def setup(args): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 5 + env_cfg.env.episode_length_s = 50 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() env_cfg.init_state.reset_mode = "reset_to_range" From e10c8360a9c17c22b1e88fe5a802ef7cce1eba7a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:15:57 -0400 Subject: [PATCH 28/43] burn in normalization before learning --- learning/algorithms/ppo2.py | 2 +- learning/runners/on_policy_runner.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..db56ef8c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -58,7 +58,7 @@ def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data, last_obs=None): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..60dff1e9 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,7 +37,7 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -50,6 +50,10 @@ def learn(self): device=self.device, ) + # burn in observation normalization. 
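+        # Running a short warm-up rollout first lets the actor/critic
+        # running-mean/std observation normalizers settle before any
+        # gradient updates are taken.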
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -57,7 +61,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -116,6 +120,21 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( From acedd9719defd776d0ff5728926cf31c737c9b0f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:23:41 -0400 Subject: [PATCH 29/43] fix configs wrt to normalization, and fixed_base --- gym/envs/base/fixed_robot_config.py | 27 +++++++++---------- gym/envs/base/legged_robot_config.py | 3 +-- gym/envs/cartpole/cartpole_config.py | 2 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 ++- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +-- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 ++ gym/envs/pendulum/pendulum_config.py | 9 ++++--- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..b5b5e88d 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..2dd2083d 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -238,13 +238,12 @@ class actor: hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] + normalize_obs = True actions = ["q_des"] disable_actions = False diff 
--git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index f18d5a3b..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..2f9bf540 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,7 +130,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - obs = [ "base_lin_vel", "base_ang_vel", @@ -140,6 +139,7 @@ class actor: "dof_vel", "dof_pos_target", ] + normalize_obs = True actions = ["dof_pos_target"] add_noise = True disable_actions = False @@ -168,6 +168,7 @@ class critic: "dof_vel", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..d89715fc 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_ang_vel", "projected_gravity", @@ -82,6 +81,7 @@ class actor: "dof_vel", "phase_obs", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -100,7 +100,6 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_height", "base_lin_vel", @@ -112,6 +111,7 @@ class critic: "phase_obs", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 86a8d606..c7e061a9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -198,6 +198,7 @@ class actor: "dof_vel", "dof_pos_history", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -226,6 +227,7 @@ class critic: "dof_vel", "dof_pos_history", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..eed5d0a9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -68,6 +68,7 @@ class actor: "dof_pos", "dof_vel", ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False @@ -77,14 +78,14 @@ class noise: dof_vel = 0.0 class critic: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" obs = [ "dof_pos", "dof_vel", ] - hidden_dims = [128, 64, 32] - # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" - standard_critic_nn = True + normalize_obs = True class reward: class weights: From a79fae1d66d2f70dec2ac2af2bc28e182c2d871d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 31 May 2024 16:14:27 -0400 Subject: [PATCH 30/43] ppo2 works again on pendulum (lqr/testing config) --- .gitignore | 1 + gym/envs/base/fixed_robot.py | 8 ++-- 
gym/envs/pendulum/pendulum.py | 60 ++++++++++++++++++---------- gym/envs/pendulum/pendulum_config.py | 37 +++++++++-------- 4 files changed, 65 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 0b68375a..6028e3b5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +*smooth_exploration/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index c3cf0566..73e2fb67 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -568,11 +568,11 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + sigma = self.cfg.reward_settings.tracking_sigma + return torch.exp(-torch.square(x) / sigma) def _reward_torques(self): """Penalize torques""" diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c22c034a..c63ffa6c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,22 +1,35 @@ +from math import sqrt import torch from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _init_buffers(self): - super()._init_buffers() - self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) - - def _post_decimation_step(self): - super()._post_decimation_step() - self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) - - def _reset_system(self, env_ids): - super()._reset_system(env_ids) - self.dof_pos_obs[env_ids] = torch.cat( - [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + def _post_physics_step(self): + """Update all states that are not handled in PhysX""" + super()._post_physics_step() + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] @@ -30,21 +43,28 @@ def _reward_equilibrium(self): error = torch.abs(self.dof_state) error[:, 0] /= self.scales["dof_pos"] error[:, 1] /= self.scales["dof_vel"] - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) + return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) # return torch.exp( # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + 
self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy - return -(energy_error / desired_energy).pow(2) - # return self._sqrdexp(energy_error / desired_energy) + return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 49534568..739594c3 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 4096 - num_actuators = 1 - episode_length_s = 25.0 + num_envs = 2**12 + num_actuators = 1 # 1 for theta connecting base and pole + episode_length_s = 10.0 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -27,14 +27,14 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 2, torch.pi / 2], + "theta": [-torch.pi, torch.pi], } - dof_vel_range = {"theta": [-1, 1]} + dof_vel_range = {"theta": [-5, 5]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 25 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -52,7 +54,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 5.0 + tau_ff = 1.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -63,7 +65,7 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - + normalize_obs = True obs = [ "dof_pos", "dof_vel", @@ -77,6 +79,8 @@ class noise: dof_vel = 0.0 class critic: + critic_class_name = "" + normalize_obs = True obs = [ "dof_pos", "dof_vel", @@ -90,24 +94,23 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.05 + energy = 0.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.025 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): # both - gamma = 0.99 - discount_horizon = 2.0 + gamma = 0.95 + # discount_horizon = 2.0 lam = 0.98 # shared max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**16 # new - clip_param = 0.2 learning_rate = 1.0e-4 max_grad_norm = 1.0 @@ -121,6 +124,6 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates + max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 100 From 3f81326c970a6cbf1d7cef0c510b82f5b8da3ad1 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 31 May 2024 16:44:30 -0400 Subject: [PATCH 31/43] revert some pendulum 
changes, SAC runs now but doesn't learn --- gym/envs/pendulum/pendulum.py | 35 +++++++++--------------- gym/envs/pendulum/pendulum_SAC_config.py | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c63ffa6c..4f111d96 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,36 +1,27 @@ -from math import sqrt import torch from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) def _check_terminations_and_timeouts(self): super()._check_terminations_and_timeouts() self.terminated = self.timed_out - def reset_to_uniform(self, env_ids): - grid_points = int(sqrt(self.num_envs)) - lin_pos = torch.linspace( - self.dof_pos_range[0, 0], - self.dof_pos_range[0, 1], - grid_points, - device=self.device, - ) - lin_vel = torch.linspace( - self.dof_vel_range[0, 0], - self.dof_vel_range[0, 1], - grid_points, - device=self.device, - ) - grid = torch.cartesian_prod(lin_pos, lin_vel) - self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) - self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index aa182e15..2ccb4eab 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -9,7 +9,7 @@ class env(PendulumCfg.env): episode_length_s = 2.5 class init_state(PendulumCfg.init_state): - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" default_joint_angles = {"theta": 0.0} dof_pos_range = { "theta": [-torch.pi / 2, torch.pi / 2], From 9278807df5afa20fe6c7553a92bfe4b8c5b4b1b3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 5 Jun 2024 14:11:54 -0400 Subject: [PATCH 32/43] sac pendulum converges somewhat (bugfixes + new rewards) --- gym/envs/pendulum/pendulum.py | 20 +++++++--- gym/envs/pendulum/pendulum_SAC_config.py | 51 ++++++++++++------------ gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/sac.py | 11 ++--- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 4f111d96..84aa0b54 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,4 +1,5 @@ import torch +import numpy as np from gym.envs.base.fixed_robot import FixedRobot @@ -23,12 +24,16 @@ def _check_terminations_and_timeouts(self): self.terminated = self.timed_out def _reward_theta(self): - theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] - return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + theta_norm = self._normalize_theta() + return -theta_norm.pow(2).item() + # theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + # return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): - omega_rwd = 
torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) - return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + omega = self.dof_vel[:, 0] + return -omega.pow(2).item() + # omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + # return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): error = torch.abs(self.dof_state) @@ -41,7 +46,8 @@ def _reward_equilibrium(self): def _reward_torques(self): """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return -self.torques.pow(2).item() + # return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): kinetic_energy = ( @@ -59,3 +65,7 @@ def _reward_energy(self): desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 2ccb4eab..8b0d1f4a 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,20 +5,20 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 256 - episode_length_s = 2.5 + num_envs = 1 + episode_length_s = 4 class init_state(PendulumCfg.init_state): reset_mode = "reset_to_range" default_joint_angles = {"theta": 0.0} dof_pos_range = { - "theta": [-torch.pi / 2, torch.pi / 2], + "theta": [-torch.pi, torch.pi], } dof_vel_range = {"theta": [-1, 1]} class control(PendulumCfg.control): - ctrl_frequency = 25 - desired_sim_frequency = 200 + ctrl_frequency = 20 + desired_sim_frequency = 100 class asset(PendulumCfg.asset): joint_damping = 0.1 @@ -38,9 +38,9 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { - "latent": [400], - "mean": [300], - "std": [300], + "latent": [128, 64], + "mean": [32], + "std": [32], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { @@ -66,46 +66,47 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [256, 256] + hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False class reward: class weights: - theta = 0.0 - omega = 0.0 - equilibrium = 2.0 + theta = 1.0 + omega = 0.1 + equilibrium = 0.0 energy = 1.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.001 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 10**3 + initial_fill = 500 storage_size = 10**6 # 17 batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 - action_max = 1.0 - action_min = -1.0 + action_max = 2.0 + action_min = -2.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 - alpha = 0.8 + alpha = 0.2 target_entropy = -1.0 max_grad_norm = 1.0 - polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) - gamma = 0.98 - alpha_lr = 3e-5 - actor_lr = 3e-5 - critic_lr = 3e-5 + polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 3e-4 + critic_lr = 3e-4 class runner(FixedRobotCfgPPO.runner): run_name = "" - experiment_name = "pendulum" - max_iterations = 10000 # number of policy updates + experiment_name = "sac_pendulum" + max_iterations = 30_000 # 
number of policy updates algorithm_class_name = "SAC" save_interval = 250 num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 739594c3..89010ebf 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -94,7 +94,7 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.5 dof_vel = 0.0 torques = 0.025 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index f7bd7510..8c57ca4c 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -183,10 +183,10 @@ def update_critic(self, batch): # * self._sample_action(actor_next_obs) mean, std = self.actor.forward(next_actor_obs, deterministic=False) distribution = torch.distributions.Normal(mean, std) - actions = distribution.rsample() + next_actions = distribution.rsample() ## * self._scale_actions(actions, intermediate=True) - actions_normalized = torch.tanh(actions) + actions_normalized = torch.tanh(next_actions) # RSL also does a resahpe(-1, self.action_size), not sure why actions_scaled = ( actions_normalized * self.action_delta + self.action_offset @@ -196,7 +196,7 @@ def update_critic(self, batch): # 1.0 - actions_normalized.pow(2) + 1e-6 # ).sum(-1) action_logp = ( - distribution.log_prob(actions) + distribution.log_prob(next_actions) - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) @@ -253,8 +253,9 @@ def update_actor_and_alpha(self, batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - 1.0 - actions_normalized.pow(2) + 1e-6 + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) # * returns target_action = actions_scaled, target_action_logp = action_logp From a6d4adfda3122b745dc846b5439b889966a39fff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 5 Jun 2024 15:24:53 -0400 Subject: [PATCH 33/43] increase LR (all pendulums balance now) --- gym/envs/pendulum/pendulum_SAC_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 8b0d1f4a..a15ae9de 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -100,13 +100,13 @@ class algorithm(FixedRobotCfgPPO.algorithm): polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.99 alpha_lr = 1e-4 - actor_lr = 3e-4 - critic_lr = 3e-4 + actor_lr = 5e-4 + critic_lr = 5e-4 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" max_iterations = 30_000 # number of policy updates algorithm_class_name = "SAC" - save_interval = 250 + save_interval = 5000 num_steps_per_env = 1 From 02611d026aa73d8c6e55d1d6458aa3de5b2c7294 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 6 Jun 2024 15:11:26 -0400 Subject: [PATCH 34/43] increase LR and episode len --- gym/envs/pendulum/pendulum_SAC_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index a15ae9de..c17494ef 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,7 +6,7 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): num_envs = 1 - episode_length_s = 4 + episode_length_s = 10 class 
init_state(PendulumCfg.init_state): reset_mode = "reset_to_range" @@ -100,13 +100,13 @@ class algorithm(FixedRobotCfgPPO.algorithm): polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.99 alpha_lr = 1e-4 - actor_lr = 5e-4 - critic_lr = 5e-4 + actor_lr = 1e-3 + critic_lr = 1e-3 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" - max_iterations = 30_000 # number of policy updates + max_iterations = 40_000 # number of policy updates algorithm_class_name = "SAC" save_interval = 5000 num_steps_per_env = 1 From dd9704e247110147cb1955bc680a6c9657823533 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 7 Jun 2024 16:42:45 -0400 Subject: [PATCH 35/43] added SAC mini cheetah and refactorings --- gym/envs/__init__.py | 7 + .../mini_cheetah/mini_cheetah_SAC_config.py | 184 ++++++++++++++++++ learning/algorithms/sac.py | 48 +---- learning/runners/off_policy_runner.py | 5 - 4 files changed, 194 insertions(+), 50 deletions(-) create mode 100644 gym/envs/mini_cheetah/mini_cheetah_SAC_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index f57f4247..f9d8b788 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -27,6 +27,7 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -40,6 +41,7 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACRunnerCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -61,6 +63,11 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "sac_mini_cheetah": [ + "MiniCheetahRef", + "MiniCheetahSACCfg", + "MiniCheetahSACRunnerCfg" + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py new file mode 100644 index 00000000..7f2b3a1d --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -0,0 +1,184 @@ +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg + +BASE_HEIGHT_REF = 0.3 + + +class MiniCheetahSACCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + num_actuators = 12 + episode_length_s = 4 # TODO + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 20 # TODO + desired_sim_frequency = 100 + + class commands(MiniCheetahRefCfg.commands): + pass + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + pass + + class asset(MiniCheetahRefCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + 
"mini_cheetah/urdf/mini_cheetah_simple.urdf" + ) + foot_name = "foot" + penalize_contacts_on = ["shank"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["foot"] + collapse_fixed_joints = False + self_collisions = 1 + flip_visual_attachments = False + disable_gravity = False + disable_motors = False + joint_damping = 0.1 + rotor_inertia = [0.002268, 0.002268, 0.005484] * 4 + + class reward_settings(MiniCheetahRefCfg.reward_settings): + soft_dof_pos_limit = 0.9 + soft_dof_vel_limit = 0.9 + soft_torque_limit = 0.9 + max_contact_force = 600.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + class scaling(MiniCheetahRefCfg.scaling): + base_ang_vel = 0.3 + base_lin_vel = BASE_HEIGHT_REF + dof_vel = 4 * [2.0, 2.0, 4.0] + base_height = 0.3 + dof_pos = 4 * [0.2, 0.3, 0.3] + dof_pos_obs = dof_pos + dof_pos_target = 4 * [0.2, 0.3, 0.3] + tau_ff = 4 * [18, 18, 28] + commands = [3, 1, 3] + + +class MiniCheetahSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor: + hidden_dims = { + "latent": [128, 128], + "mean": [64], + "std": [64], + } + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = { + "latent": "elu", + "mean": "elu", + "std": "elu", + } + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = ["dof_pos_target"] + add_noise = False # TODO + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [128, 128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 + lin_vel_z = 0.0 + ang_vel_xy = 0.01 + orientation = 1.0 + torques = 5.0e-7 + dof_vel = 0.0 + min_base_height = 1.5 + collision = 0.0 + action_rate = 0.01 + action_rate2 = 0.001 + stand_still = 0.0 + dof_pos_limits = 0.0 + feet_contact_forces = 0.0 + dof_near_home = 0.0 + reference_traj = 1.5 + swing_grf = 1.5 + stance_grf = 1.5 + + class termination_weight: + termination = 0.15 + + class algorithm(FixedRobotCfgPPO.algorithm): + # Taken from SAC pendulum + initial_fill = 500 + storage_size = 10**6 + batch_size = 256 + max_gradient_steps = 1 # 10 + action_max = 1.0 # TODO + action_min = -1.0 # TODO + actor_noise_std = 0.5 # TODO + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -12.0 # -action_dim + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to SB3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "sac_mini_cheetah" + max_iterations = 50_000 + algorithm_class_name = "SAC" + save_interval = 10_000 + num_steps_per_env = 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 8c57ca4c..e0651d2d 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -27,26 +27,14 @@ def __init__( max_grad_norm=1.0, polyak=0.995, gamma=0.99, - # clip_param=0.2, # * PPO - # gamma=0.998, - # lam=0.95, - # entropy_coef=0.0, 
actor_lr=1e-4, critic_lr=1e-4, - # max_grad_norm=1.0, - # use_clipped_value_loss=True, - # schedule="fixed", - # desired_kl=0.01, device="cpu", **kwargs, ): self.device = device - # self.desired_kl = desired_kl - # self.schedule = schedule - # self.lr = lr - - # * PPO components + # * SAC components self.actor = actor.to(self.device) self.critic_1 = critic_1.to(self.device) self.critic_2 = critic_2.to(self.device) @@ -74,29 +62,14 @@ def __init__( target_entropy if target_entropy else -self.actor.num_actions ) - # * PPO parameters - # self.clip_param = clip_param - # self.batch_size = batch_size - self.max_gradient_steps = max_gradient_steps - # self.entropy_coef = entropy_coef - # self.gamma = gamma - # self.lam = lam - # self.max_grad_norm = max_grad_norm - # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters + self.max_gradient_steps = max_gradient_steps self.batch_size = batch_size self.polyak = polyak self.gamma = gamma # self.ent_coef = "fixed" # self.target_entropy = "fixed" - self.test_input = torch.randn(256, 3, device=self.device) - self.test_actions = torch.zeros(256, 1, device=self.device) - self.test_action_mean = torch.zeros(256, device=self.device) - self.test_action_std = torch.zeros(256, device=self.device) - self.test_action_max = torch.zeros(256, device=self.device) - self.test_action_min = torch.zeros(256, device=self.device) - @property def alpha(self): return self.log_alpha.exp() @@ -163,12 +136,7 @@ def update(self, data): self.mean_alpha_loss /= count self.mean_critic_1_loss /= count self.mean_critic_2_loss /= count - with torch.inference_mode(): - self.test_actions = self.act_inference(self.test_input).cpu() - self.test_action_mean = self.test_actions.mean().item() - self.test_action_std = self.test_actions.std().item() - self.test_action_max = self.test_actions.max().item() - self.test_action_min = self.test_actions.min().item() + return None def update_critic(self, batch): @@ -192,9 +160,6 @@ def update_critic(self, batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - # action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - # 1.0 - actions_normalized.pow(2) + 1e-6 - # ).sum(-1) action_logp = ( distribution.log_prob(next_actions) - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) @@ -267,13 +232,6 @@ def update_actor_and_alpha(self, batch): self.log_alpha * (action_logp + self.target_entropy).detach() ).mean() - # alpha_loss = ( - # -self.log_alpha * (action_logp + self.target_entropy).detach() - # ).mean() - # alpha_loss = ( - # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() - # ) - self.log_alpha_optimizer.zero_grad() alpha_loss.backward() self.log_alpha_optimizer.step() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 2a7bc66e..e57be9a9 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -221,11 +221,6 @@ def set_up_logger(self): "mean_actor_loss", "mean_alpha_loss", "alpha", - # "test_actions", - "test_action_mean", - "test_action_std", - "test_action_max", - "test_action_min", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From 65eb25339402f7efadff6c661901bdba5b3a7e57 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 12 Jun 2024 10:05:59 -0400 Subject: [PATCH 36/43] PPO and SAC work on same rewards and config --- gym/envs/pendulum/pendulum.py | 27 ++++++++++-------------- 
gym/envs/pendulum/pendulum_SAC_config.py | 14 ++++++------ gym/envs/pendulum/pendulum_config.py | 8 +++---- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 84aa0b54..d935b7b8 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -24,30 +24,24 @@ def _check_terminations_and_timeouts(self): self.terminated = self.timed_out def _reward_theta(self): - theta_norm = self._normalize_theta() - return -theta_norm.pow(2).item() - # theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] - # return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling + return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): - omega = self.dof_vel[:, 0] - return -omega.pow(2).item() - # omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) - # return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): - error = torch.abs(self.dof_state) - error[:, 0] /= self.scales["dof_pos"] - error[:, 1] /= self.scales["dof_vel"] + theta_norm = self._normalize_theta() + omega = self.dof_vel[:, 0] + error = torch.stack( + [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 + ) return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) - # return torch.exp( - # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma - # ) def _reward_torques(self): """Penalize torques""" - return -self.torques.pow(2).item() - # return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): kinetic_energy = ( @@ -67,5 +61,6 @@ def _reward_energy(self): return self._sqrdexp(energy_error / desired_energy) def _normalize_theta(self): + # normalize to range [-pi, pi] theta = self.dof_pos[:, 0] return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index c17494ef..48488583 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -14,10 +14,10 @@ class init_state(PendulumCfg.init_state): dof_pos_range = { "theta": [-torch.pi, torch.pi], } - dof_vel_range = {"theta": [-1, 1]} + dof_vel_range = {"theta": [-5, 5]} class control(PendulumCfg.control): - ctrl_frequency = 20 + ctrl_frequency = 10 desired_sim_frequency = 100 class asset(PendulumCfg.asset): @@ -74,12 +74,12 @@ class critic: class reward: class weights: - theta = 1.0 - omega = 0.1 - equilibrium = 0.0 - energy = 1.0 + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.5 dof_vel = 0.0 - torques = 0.001 + torques = 0.025 class termination_weight: termination = 0.0 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 89010ebf..24942825 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -65,9 +65,9 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - normalize_obs = True + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] @@ -80,9 +80,9 @@ class noise: class critic: critic_class_name = "" - normalize_obs = True + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] hidden_dims 
= [128, 64, 32] From 2ccfd8f90846e2411c3e23dcd7f9900eb063673e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 12 Jun 2024 10:38:09 -0400 Subject: [PATCH 37/43] refactorings --- gym/envs/pendulum/pendulum.py | 4 ---- gym/envs/pendulum/pendulum_config.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index d935b7b8..47d23eb5 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -19,10 +19,6 @@ def _reset_system(self, env_ids): [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 ) - def _check_terminations_and_timeouts(self): - super()._check_terminations_and_timeouts() - self.terminated = self.timed_out - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 24942825..bc1cdea7 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**12 - num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 10.0 + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -65,6 +65,8 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again normalize_obs = False obs = [ "dof_pos_obs", @@ -80,6 +82,8 @@ class noise: class critic: critic_class_name = "" + + # TODO[lm]: Handle normalization in SAC, then also use it here again normalize_obs = False obs = [ "dof_pos_obs", @@ -126,4 +130,4 @@ class runner(FixedRobotCfgPPO.runner): experiment_name = "pendulum" max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 100 + num_steps_per_env = 32 From 49ddec29aa12f6e37dee9259ca781f130f440211 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 14 Jun 2024 14:52:01 -0400 Subject: [PATCH 38/43] fixes --- .gitignore | 1 - gym/envs/base/fixed_robot.py | 8 ++++---- gym/envs/mini_cheetah/mini_cheetah_SAC_config.py | 9 ++++----- gym/envs/pendulum/pendulum.py | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 6028e3b5..0b68375a 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -*smooth_exploration/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 73e2fb67..c3cf0566 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -568,11 +568,11 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, sigma=None): + def _sqrdexp(self, x, scale=1.0): """shorthand helper for squared exponential""" - if sigma is None: - sigma = self.cfg.reward_settings.tracking_sigma - return torch.exp(-torch.square(x) / sigma) + return torch.exp( + 
-torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma + ) def _reward_torques(self): """Penalize torques""" diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py index 7f2b3a1d..69da4cf8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -1,4 +1,4 @@ -from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.base.legged_robot_config import LeggedRobotRunnerCfg from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg BASE_HEIGHT_REF = 0.3 @@ -7,7 +7,6 @@ class MiniCheetahSACCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): num_envs = 1 - num_actuators = 12 episode_length_s = 4 # TODO class terrain(MiniCheetahRefCfg.terrain): @@ -70,7 +69,7 @@ class scaling(MiniCheetahRefCfg.scaling): commands = [3, 1, 3] -class MiniCheetahSACRunnerCfg(FixedRobotCfgPPO): +class MiniCheetahSACRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OffPolicyRunner" @@ -155,7 +154,7 @@ class weights: class termination_weight: termination = 0.15 - class algorithm(FixedRobotCfgPPO.algorithm): + class algorithm(LeggedRobotRunnerCfg.algorithm): # Taken from SAC pendulum initial_fill = 500 storage_size = 10**6 @@ -175,7 +174,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): actor_lr = 1e-4 critic_lr = 1e-4 - class runner(FixedRobotCfgPPO.runner): + class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "sac_mini_cheetah" max_iterations = 50_000 diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 47d23eb5..39c8b331 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -33,11 +33,11 @@ def _reward_equilibrium(self): error = torch.stack( [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 ) - return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) + return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) def _reward_torques(self): """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) def _reward_energy(self): kinetic_energy = ( From 5025875bb212df977584d523c577a1dc6c169c88 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 15 Jun 2024 15:22:36 -0400 Subject: [PATCH 39/43] visualize value function for PPO --- gym/envs/base/fixed_robot.py | 4 +- gym/envs/base/fixed_robot_config.py | 1 + gym/envs/base/task_skeleton.py | 1 + gym/envs/pendulum/pendulum.py | 29 ++++-- gym/envs/pendulum/pendulum_config.py | 6 +- learning/modules/lqrc/plotting.py | 130 +++++++++++++++++++++++++++ learning/runners/BaseRunner.py | 1 + learning/runners/on_policy_runner.py | 36 ++++++-- learning/utils/dict_utils.py | 13 ++- scripts/train.py | 4 +- scripts/visualize_value_function.py | 90 +++++++++++++++++++ 11 files changed, 296 insertions(+), 19 deletions(-) create mode 100644 learning/modules/lqrc/plotting.py create mode 100644 scripts/visualize_value_function.py diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index c3cf0566..bbfab6d3 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -401,8 +401,8 @@ def _init_buffers(self): self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) # * check that init range highs and lows are consistent # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - 
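# Illustrative sketch (assumption, not part of the patch): a standalone version of
# the reworked _sqrdexp helper above, with tracking_sigma = 0.25 taken from the
# mini-cheetah reward_settings. A larger `scale` widens the reward bump around
# x = 0, and the output always lies in (0, 1].
import torch


def sqrdexp(x: torch.Tensor, scale: float = 1.0, tracking_sigma: float = 0.25) -> torch.Tensor:
    # exp(-(x / scale)^2 / sigma): equals 1 at x = 0 and decays smoothly with |x|
    return torch.exp(-torch.square(x / scale) / tracking_sigma)


# e.g. sqrdexp(torch.tensor([0.0, 0.5, 1.0])) -> tensor([1.0000, 0.3679, 0.0183])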
self.initialize_ranges_for_initial_conditions() + # if self.cfg.init_state.reset_mode == "reset_to_range": + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a9bdef47..ecf12dae 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -193,6 +193,7 @@ class runner: # * logging # * check for potential saves every this many iterations save_interval = 50 + log_storage = False run_name = "" experiment_name = "fixed_robot" diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 6a1b352c..0974970a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 39c8b331..d0bcfb9c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,5 +1,6 @@ import torch import numpy as np +from math import sqrt from gym.envs.base.fixed_robot import FixedRobot @@ -19,6 +20,28 @@ def _reset_system(self, env_ids): [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 ) + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) + def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd.squeeze(dim=-1)) @@ -33,11 +56,7 @@ def _reward_equilibrium(self): error = torch.stack( [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 ) - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) - - def _reward_torques(self): - """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + return self._sqrdexp(torch.mean(error, dim=1), scale=0.2) def _reward_energy(self): kinetic_energy = ( diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 0fb705ae..05b00e33 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" # * initial conditions for reset_to_range dof_pos_range = { @@ -129,4 +129,6 @@ class runner(FixedRobotCfgPPO.runner): experiment_name = "pendulum" max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 100 + save_interval = 50 + log_storage = True diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py new file mode 100644 index 00000000..f5927b24 --- /dev/null +++ b/learning/modules/lqrc/plotting.py @@ -0,0 +1,130 @@ +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.colors import ListedColormap + +import numpy as np + + +def create_custom_bwr_colormap(): + # Define the colors for each segment + dark_blue = [0, 0, 0.5, 1] + light_blue = [0.5, 0.5, 1, 1] + white = [1, 1, 1, 1] + light_red = [1, 0.5, 0.5, 1] + dark_red = [0.5, 0, 0, 1] + + # Number of bins for each segment + n_bins = 128 + mid_band = 5 + + # Create the colormap segments + blue_segment = np.linspace(dark_blue, light_blue, n_bins // 2) + white_segment = np.tile(white, (mid_band, 1)) + red_segment = np.linspace(light_red, dark_red, n_bins // 2) + + # Stack segments to create the full colormap + colors = np.vstack((blue_segment, white_segment, red_segment)) + custom_bwr = ListedColormap(colors, name="custom_bwr") + + return custom_bwr + + +def plot_pendulum_multiple_critics_w_data( + x, predictions, targets, title, fn, data, colorbar_label="f(x)", grid_size=64 +): + num_critics = len(x.keys()) + fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(4 * num_critics, 6)) + + # Determine global min and max error for consistent scaling + global_min_error = float("inf") + global_max_error = float("-inf") + global_min_prediction = float("inf") + global_max_prediction = float("-inf") + prediction_cmap = mpl.cm.get_cmap("viridis") + error_cmap = create_custom_bwr_colormap() + + for critic_name in x: + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + global_min_error = min(global_min_error, np.min(np_error)) + global_max_error = max(global_max_error, np.max(np_error)) + global_min_prediction = np.min(np_targets) + global_max_prediction = np.max(np_targets) + error_norm = mcolors.TwoSlopeNorm( + vmin=global_min_error, vcenter=0, vmax=global_max_error + ) + prediction_norm = mcolors.CenteredNorm( + vcenter=(global_max_prediction + global_min_prediction) / 2, + halfrange=(global_max_prediction - global_min_prediction) / 2, + ) + + xcord = np.linspace(-2 * np.pi, 2 * np.pi, grid_size) + ycord = np.linspace(-5, 5, grid_size) + + for ix, critic_name in enumerate(x): + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + + # Predictions + axes[0, ix].imshow( + np_predictions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=prediction_cmap, + norm=prediction_norm, + ) + axes[0, ix].set_title(f"{critic_name} Prediction") + + if ix == 0: + continue + + # Errors + axes[1, ix].imshow( + np_error.reshape(grid_size, 
grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=error_cmap, + norm=error_norm, + ) + axes[1, ix].set_title(f"{critic_name} Error") + + # Trajectories + data = data.detach().cpu().numpy() + theta = data[:, :, 0] + omega = data[:, :, 1] + axes[1, 0].plot(theta, omega, lw=1) + axes[1, 0].set_xlabel("theta") + axes[1, 0].set_ylabel("theta_dot") + fig.suptitle(title, fontsize=16) + + # Ensure the axes are the same for all plots + for ax in axes.flat: + ax.set_xlim([xcord.min(), xcord.max()]) + ax.set_ylim([ycord.min(), ycord.max()]) + + plt.subplots_adjust( + top=0.9, bottom=0.1, left=0.1, right=0.9, hspace=0.4, wspace=0.3 + ) + + fig.colorbar( + mpl.cm.ScalarMappable(norm=prediction_norm, cmap=prediction_cmap), + ax=axes[0, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + fig.colorbar( + mpl.cm.ScalarMappable(norm=error_norm, cmap=error_cmap), + ax=axes[1, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..698fdf1d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -10,6 +10,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.env = env self.parse_train_cfg(train_cfg) + self.log_storage = self.cfg["log_storage"] self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c1758000..b7fc3722 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -6,6 +6,7 @@ from .BaseRunner import BaseRunner from learning.storage import DictStorage +from learning.utils import export_to_numpy logger = Logger() storage = DictStorage() @@ -30,7 +31,6 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) @@ -42,9 +42,19 @@ def learn(self): "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) + if self.log_storage: + transition.update( + { + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, + } + ) + storage.initialize( transition, self.env.num_envs, @@ -52,9 +62,11 @@ def learn(self): device=self.device, ) + self.save() # burn in observation normalization. 
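# Illustrative sketch (assumption, not part of the patch): the per-step transition
# layout the runner assembles above, written as a minimal standalone TensorDict.
# Sizes are placeholders; the point is that "dones" is the union of "timed_out"
# (episode length reached) and "terminated" (failure), which are stored separately.
import torch
from tensordict import TensorDict

num_envs = 4
transition = TensorDict(
    {
        "actor_obs": torch.zeros(num_envs, 3),
        "actions": torch.zeros(num_envs, 1),
        "rewards": torch.zeros(num_envs),
        "timed_out": torch.zeros(num_envs, dtype=torch.bool),
        "terminated": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=num_envs,
)
transition["dones"] = transition["timed_out"] | transition["terminated"]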
if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: self.burn_in_normalization() + self.env.reset() logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): @@ -99,9 +111,18 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) + if self.log_storage: + transition.update( + { + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, + } + ) + storage.add_transitions(transition) logger.log_rewards(rewards_dict) @@ -109,6 +130,9 @@ def learn(self): logger.finish_step(dones) logger.toc("collection") + if self.it % self.save_interval == 0: + self.save() + logger.tic("learning") self.alg.update(storage.data) storage.clear() @@ -120,9 +144,7 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + # self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -179,6 +201,10 @@ def save(self): }, path, ) + if self.log_storage: + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 99534c22..ba9edb6d 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -4,18 +4,25 @@ @torch.no_grad def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking if critic is None: last_values = torch.zeros_like(data["rewards"][0]) else: last_values = critic.evaluate(data["critic_obs"][-1]) returns = torch.zeros_like(data["rewards"]) - returns[-1] = last_values * ~data["dones"][-1] + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done - - return normalize(returns) + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * data["timed_out"][k] + * ~data["terminated"][k] + ) + return returns @torch.no_grad diff --git a/scripts/train.py b/scripts/train.py index 759992cf..5b55f188 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils import get_args, task_registry # , randomize_episode_counters from gym.utils.logging_and_saving import wandb_singleton from gym.utils.logging_and_saving import local_code_save_helper @@ -12,7 +12,7 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) + # randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) diff --git a/scripts/visualize_value_function.py b/scripts/visualize_value_function.py new file mode 100644 index 00000000..4f98b572 --- /dev/null +++ b/scripts/visualize_value_function.py @@ -0,0 +1,90 @@ +import matplotlib.pyplot as plt # noqa F401 +from learning.modules.critic import Critic # noqa F401 + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from 
learning.modules.lqrc.plotting import plot_pendulum_multiple_critics_w_data +from gym import LEGGED_GYM_ROOT_DIR +import os +import shutil + +import torch + +DEVICE = "cuda:0" + +# * Setup +experiment_name = "pendulum" +run_name = "obs_no_norm_100" + +log_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", experiment_name, run_name) +plot_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "V_plots", run_name) +os.makedirs(plot_dir, exist_ok=True) + +# * Critic Params +name = "PPO" +n_obs = 3 +hidden_dims = [128, 64, 32] +activation = "tanh" +normalize_obs = False + +# * Params +gamma = 0.95 +lam = 1.0 +num_steps = 10 # ! want this at 1 +n_trajs = 64 +rand_perm = torch.randperm(4096) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + + +for iteration in [50, 100, 150, 200]: + # load data + base_data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to( + DEVICE + ) + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data = {data_name: {} for data_name in ["obs", "values", "returns"]} + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load critic + model = torch.load(os.path.join(log_dir, "model_{}.pt".format(iteration))) + critic = Critic(n_obs, hidden_dims, activation, normalize_obs).to(DEVICE) + critic.load_state_dict(model["critic_state_dict"]) + + # compute values and returns + data = base_data.detach().clone() + data["values"] = critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"][name] = graphing_obs[0, :] + graphing_data["values"][name] = critic.evaluate(data[0, :]["critic_obs"]) + graphing_data["returns"][name] = data[0, :]["returns"] + + # generate plots + plot_pendulum_multiple_critics_w_data( + graphing_data["obs"], + graphing_data["values"], + graphing_data["returns"], + title=f"iteration{iteration}", + fn=plot_dir + f"/{name}_CRITIC_it{iteration}", + data=graphing_obs[:num_steps, traj_idx], + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_value_function.py") +shutil.copy(this_file, os.path.join(plot_dir, os.path.basename(this_file))) From fa496177ae8dd3e7e710d9d8e20cdee276d96844 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 15 Jun 2024 15:31:08 -0400 Subject: [PATCH 40/43] forgot export_to_numpy in dict_utils --- learning/utils/dict_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index ba9edb6d..873dfdda 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -1,3 +1,4 @@ +import numpy as np import torch from tensordict import TensorDict @@ -84,3 +85,12 @@ def create_uniform_generator( indices[i * batch_size : (i + 1) * batch_size] ] yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. 
+ if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return From 7b44683d3a4b6c6769803425c7497e4f383f239a Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 17 Jun 2024 15:49:28 -0400 Subject: [PATCH 41/43] visualize SAC critics, update plotting for both SAC and PPO --- gym/envs/pendulum/pendulum_SAC_config.py | 13 +- gym/envs/pendulum/pendulum_config.py | 4 +- learning/modules/lqrc/plotting.py | 63 +++++--- learning/runners/off_policy_runner.py | 35 ++++- ...ize_value_function.py => visualize_ppo.py} | 27 ++-- scripts/visualize_sac.py | 137 ++++++++++++++++++ 6 files changed, 239 insertions(+), 40 deletions(-) rename scripts/{visualize_value_function.py => visualize_ppo.py} (87%) create mode 100644 scripts/visualize_sac.py diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 48488583..e3b3047b 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,11 +5,11 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 1 + num_envs = 1024 episode_length_s = 10 class init_state(PendulumCfg.init_state): - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" default_joint_angles = {"theta": 0.0} dof_pos_range = { "theta": [-torch.pi, torch.pi], @@ -85,8 +85,8 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 500 - storage_size = 10**6 # 17 + initial_fill = 0 + storage_size = 100 * 1024 # steps_per_episode * num_envs batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 action_max = 2.0 @@ -106,7 +106,8 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" - max_iterations = 40_000 # number of policy updates + max_iterations = 30_000 # number of policy updates algorithm_class_name = "SAC" - save_interval = 5000 num_steps_per_env = 1 + save_interval = 2000 + log_storage = True diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 05b00e33..d8086666 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -127,8 +127,8 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 200 # number of policy updates + max_iterations = 100 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 100 - save_interval = 50 + save_interval = 20 log_storage = True diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index f5927b24..1c9b3581 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -31,7 +31,15 @@ def create_custom_bwr_colormap(): def plot_pendulum_multiple_critics_w_data( - x, predictions, targets, title, fn, data, colorbar_label="f(x)", grid_size=64 + x, + predictions, + targets, + title, + fn, + data, + colorbar_label="f(x)", + grid_size=64, + actions=None, ): num_critics = len(x.keys()) fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(4 * num_critics, 6)) @@ -43,6 +51,7 @@ def plot_pendulum_multiple_critics_w_data( global_max_prediction = float("-inf") prediction_cmap = mpl.cm.get_cmap("viridis") error_cmap = create_custom_bwr_colormap() + action_cmap = mpl.cm.get_cmap("viridis") for critic_name in x: np_predictions = 
predictions[critic_name].detach().cpu().numpy().reshape(-1) @@ -52,15 +61,20 @@ def plot_pendulum_multiple_critics_w_data( np_error = np_predictions - np_targets global_min_error = min(global_min_error, np.min(np_error)) global_max_error = max(global_max_error, np.max(np_error)) - global_min_prediction = np.min(np_targets) - global_max_prediction = np.max(np_targets) + global_min_prediction = min( + global_min_prediction, np.min(np_predictions), np.min(np_targets) + ) + global_max_prediction = max( + global_max_prediction, np.max(np_predictions), np.max(np_targets) + ) error_norm = mcolors.TwoSlopeNorm( - vmin=global_min_error, vcenter=0, vmax=global_max_error + vmin=global_min_error - 1e-5, vcenter=0, vmax=global_max_error + 1e-5 ) prediction_norm = mcolors.CenteredNorm( vcenter=(global_max_prediction + global_min_prediction) / 2, halfrange=(global_max_prediction - global_min_prediction) / 2, ) + action_norm = mcolors.CenteredNorm() xcord = np.linspace(-2 * np.pi, 2 * np.pi, grid_size) ycord = np.linspace(-5, 5, grid_size) @@ -72,7 +86,7 @@ def plot_pendulum_multiple_critics_w_data( ) np_error = np_predictions - np_targets - # Predictions + # Plot Predictions axes[0, ix].imshow( np_predictions.reshape(grid_size, grid_size).T, origin="lower", @@ -85,17 +99,32 @@ def plot_pendulum_multiple_critics_w_data( if ix == 0: continue - # Errors - axes[1, ix].imshow( - np_error.reshape(grid_size, grid_size).T, - origin="lower", - extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), - cmap=error_cmap, - norm=error_norm, - ) - axes[1, ix].set_title(f"{critic_name} Error") - - # Trajectories + if actions is None: + # Plot Errors + axes[1, ix].imshow( + np_error.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=error_cmap, + norm=error_norm, + ) + axes[1, ix].set_title(f"{critic_name} Error") + ax1_mappable = mpl.cm.ScalarMappable(norm=error_norm, cmap=error_cmap) + + else: + # Plot Actions + np_actions = actions[critic_name].detach().cpu().numpy().reshape(-1) + axes[1, ix].imshow( + np_actions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=action_cmap, + norm=action_norm, + ) + axes[1, ix].set_title(f"{critic_name} Action") + ax1_mappable = mpl.cm.ScalarMappable(norm=action_norm, cmap=action_cmap) + + # Plot MC Trajectories data = data.detach().cpu().numpy() theta = data[:, :, 0] omega = data[:, :, 1] @@ -120,7 +149,7 @@ def plot_pendulum_multiple_critics_w_data( label=colorbar_label, ) fig.colorbar( - mpl.cm.ScalarMappable(norm=error_norm, cmap=error_cmap), + ax1_mappable, ax=axes[1, :].ravel().tolist(), shrink=0.95, label=colorbar_label, diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e57be9a9..cc9d2666 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -8,6 +8,7 @@ from learning.modules import Critic, ChimeraActor from learning.storage import ReplayBuffer from learning.algorithms import SAC +from learning.utils import export_to_numpy logger = Logger() storage = ReplayBuffer() @@ -55,7 +56,6 @@ def learn(self): critic_obs = self.get_obs(self.critic_cfg["obs"]) actions = self.alg.act(actor_obs) tot_iter = self.it + self.num_learning_iterations - self.save() # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) @@ -67,9 +67,18 @@ def learn(self): "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": 
self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) + if self.log_storage: + transition.update( + { + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, + } + ) storage.initialize( transition, self.env.num_envs, @@ -115,9 +124,17 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) + if self.log_storage: + transition.update( + { + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, + } + ) storage.add_transitions(transition) # print every 10% of initial fill if (self.alg_cfg["initial_fill"] > 10) and ( @@ -171,6 +188,13 @@ def learn(self): "dones": dones, } ) + if self.log_storage: + transition.update( + { + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, + } + ) storage.add_transitions(transition) logger.log_rewards(rewards_dict) @@ -190,7 +214,8 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - self.save() + + # self.save() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( @@ -244,6 +269,10 @@ def save(self): "iter": self.it, } torch.save(save_dict, path) + if self.log_storage: + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) diff --git a/scripts/visualize_value_function.py b/scripts/visualize_ppo.py similarity index 87% rename from scripts/visualize_value_function.py rename to scripts/visualize_ppo.py index 4f98b572..19fe6b7c 100644 --- a/scripts/visualize_value_function.py +++ b/scripts/visualize_ppo.py @@ -12,11 +12,12 @@ import torch -DEVICE = "cuda:0" +# DEVICE = "cuda:0" +DEVICE = "cpu" # * Setup experiment_name = "pendulum" -run_name = "obs_no_norm_100" +run_name = "obs_no_norm" log_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", experiment_name, run_name) plot_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "V_plots", run_name) @@ -24,26 +25,28 @@ # * Critic Params name = "PPO" -n_obs = 3 +n_obs = 3 # [dof_pos_obs, dof_vel] hidden_dims = [128, 64, 32] activation = "tanh" normalize_obs = False +n_envs = 4096 # * Params gamma = 0.95 lam = 1.0 num_steps = 10 # ! 
want this at 1 n_trajs = 64 -rand_perm = torch.randperm(4096) +rand_perm = torch.randperm(n_envs) traj_idx = rand_perm[0:n_trajs] test_idx = rand_perm[n_trajs : n_trajs + 1000] +it_delta = 20 +it_total = 100 +it_range = range(it_delta, it_total + 1, it_delta) -for iteration in [50, 100, 150, 200]: +for it in it_range: # load data - base_data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to( - DEVICE - ) + base_data = torch.load(os.path.join(log_dir, "data_{}.pt".format(it))).to(DEVICE) dof_pos = base_data["dof_pos"].detach().clone() dof_vel = base_data["dof_vel"].detach().clone() @@ -59,7 +62,7 @@ graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] # load critic - model = torch.load(os.path.join(log_dir, "model_{}.pt".format(iteration))) + model = torch.load(os.path.join(log_dir, "model_{}.pt".format(it))) critic = Critic(n_obs, hidden_dims, activation, normalize_obs).to(DEVICE) critic.load_state_dict(model["critic_state_dict"]) @@ -79,12 +82,12 @@ graphing_data["obs"], graphing_data["values"], graphing_data["returns"], - title=f"iteration{iteration}", - fn=plot_dir + f"/{name}_CRITIC_it{iteration}", + title=f"iteration{it}", + fn=plot_dir + f"/PPO_CRITIC_it{it}", data=graphing_obs[:num_steps, traj_idx], ) plt.close() -this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_value_function.py") +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_ppo.py") shutil.copy(this_file, os.path.join(plot_dir, os.path.basename(this_file))) diff --git a/scripts/visualize_sac.py b/scripts/visualize_sac.py new file mode 100644 index 00000000..a0ac6296 --- /dev/null +++ b/scripts/visualize_sac.py @@ -0,0 +1,137 @@ +import matplotlib.pyplot as plt # noqa F401 +from learning.modules.critic import Critic # noqa F401 + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from learning.modules.lqrc.plotting import plot_pendulum_multiple_critics_w_data +from gym import LEGGED_GYM_ROOT_DIR +import os +import shutil +import numpy as np +import torch + +# DEVICE = "cuda:0" +DEVICE = "cpu" + +# * Setup +experiment_name = "sac_pendulum" +run_name = "1024envs" + +log_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", experiment_name, run_name) +plot_dir = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "V_plots_sac", run_name) +os.makedirs(plot_dir, exist_ok=True) + +# * Critic Params +name = "SAC" +n_obs = 4 # [dof_pos_obs, dof_vel, action] +hidden_dims = [128, 64, 32] +activation = "elu" +normalize_obs = False +n_envs = 1024 + +# * Params +gamma = 0.95 +lam = 1.0 +episode_steps = 100 +visualize_steps = 10 # just to show rollouts +n_trajs = 64 +rand_perm = torch.randperm(n_envs) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + +it_delta = 2000 +it_total = 30_000 +it_range = range(it_delta, it_total + 1, it_delta) + +for it in it_range: + # load data + base_data = torch.load(os.path.join(log_dir, "data_{}.pt".format(it))).to(DEVICE) + # TODO: handle buffer differently? 
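# Illustrative note (assumption, not stated in the patch): the replay buffer is laid
# out as [steps, num_envs, ...], so keeping only the last `episode_steps` rows
# approximates the most recent full episode per environment, which is what the
# Monte-Carlo return comparison below relies on.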
+ base_data = base_data[-episode_steps:, :] + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data = { + data_name: {} for data_name in ["obs", "values", "returns", "actions"] + } + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load model which includes both critics + model = torch.load(os.path.join(log_dir, "model_{}.pt".format(it))) + + # line search to find best action + data = base_data.detach().clone() + data_shape = data["critic_obs"].shape # [a, b, 3] + + # create a tensor of actions to evaluate + N = 41 + actions_space = torch.linspace(-2, 2, N).to(DEVICE) + actions = ( + actions_space.unsqueeze(0).unsqueeze(1).unsqueeze(-1) + ) # Shape: [1, 1, N, 1] + + # repeat the actions for each entry in data + actions = actions.repeat(data_shape[0], data_shape[1], 1, 1) # Shape: [a, b, N, 1] + + # repeat the data for each action + critic_obs = ( + data["critic_obs"].unsqueeze(2).repeat(1, 1, N, 1) + ) # Shape: [a, b, N, 3] + + # concatenate the actions to the data + critic_obs = torch.cat((critic_obs, actions), dim=3) # Shape: [a, b, N, 4] + + # evaluate the critic for all actions and entries + for critic_str in ["critic_1", "critic_2"]: + critic_name = name + " " + critic_str + critic = Critic(n_obs, hidden_dims, activation, normalize_obs).to(DEVICE) + critic.load_state_dict(model[critic_str + "_state_dict"]) + q_values = critic.evaluate(critic_obs) # Shape: [a, b, N] + + # find the best action for each entry + best_actions_idx = torch.argmax(q_values, dim=2) # Shape: [a, b] + best_actions = actions_space[best_actions_idx] # Shape: [a, b] + best_actions = best_actions.unsqueeze(-1) # Shape: [a, b, 1] + + # compute values and returns + best_obs = torch.cat( + (data["critic_obs"], best_actions), dim=2 + ) # Shape: [a, b, 4] + data["values"] = critic.evaluate(best_obs) # Shape: [a, b] + data["next_critic_obs"] = best_obs # needed for GAE + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"][critic_name] = graphing_obs[0, :] + graphing_data["values"][critic_name] = critic.evaluate(best_obs[0, :]) + graphing_data["returns"][critic_name] = data[0, :]["returns"] + graphing_data["actions"][critic_name] = best_actions[0, :] + + # generate plots + grid_size = int(np.sqrt(n_envs)) + plot_pendulum_multiple_critics_w_data( + graphing_data["obs"], + graphing_data["values"], + graphing_data["returns"], + title=f"iteration{it}", + fn=plot_dir + f"/{name}_CRITIC_it{it}", + data=graphing_obs[:visualize_steps, traj_idx], + grid_size=grid_size, + actions=graphing_data["actions"], + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_sac.py") +shutil.copy(this_file, os.path.join(plot_dir, os.path.basename(this_file))) From 39a767f8faed48d86777f9913f6cb9b05077dde0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 2 Jul 2024 15:18:19 -0400 Subject: [PATCH 42/43] slimmed down runner --- gym/envs/base/fixed_robot.py | 3 --- learning/runners/off_policy_runner.py | 28 
+++++++-------------------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index bbfab6d3..6d9dc971 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -399,9 +399,6 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - # if self.cfg.init_state.reset_mode == "reset_to_range": self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index cc9d2666..960112a4 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -70,15 +70,10 @@ def learn(self): "timed_out": self.get_timed_out(), "terminated": self.get_terminated(), "dones": self.get_timed_out() | self.get_terminated(), + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) - if self.log_storage: - transition.update( - { - "dof_pos": self.env.dof_pos, - "dof_vel": self.env.dof_vel, - } - ) storage.initialize( transition, self.env.num_envs, @@ -100,6 +95,8 @@ def learn(self): "actor_obs": actor_obs, "actions": actions, "critic_obs": critic_obs, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) @@ -128,13 +125,7 @@ def learn(self): "dones": dones, } ) - if self.log_storage: - transition.update( - { - "dof_pos": self.env.dof_pos, - "dof_vel": self.env.dof_vel, - } - ) + storage.add_transitions(transition) # print every 10% of initial fill if (self.alg_cfg["initial_fill"] > 10) and ( @@ -161,6 +152,8 @@ def learn(self): "actor_obs": actor_obs, "actions": actions, "critic_obs": critic_obs, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) @@ -188,13 +181,6 @@ def learn(self): "dones": dones, } ) - if self.log_storage: - transition.update( - { - "dof_pos": self.env.dof_pos, - "dof_vel": self.env.dof_vel, - } - ) storage.add_transitions(transition) logger.log_rewards(rewards_dict) From 31968b61f5731238ed6c713b64b898ea53cea590 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 3 Jul 2024 16:02:57 -0400 Subject: [PATCH 43/43] adress comments --- scripts/visualize_ppo.py | 9 ++++----- scripts/visualize_sac.py | 5 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/visualize_ppo.py b/scripts/visualize_ppo.py index 19fe6b7c..fd4e8aeb 100644 --- a/scripts/visualize_ppo.py +++ b/scripts/visualize_ppo.py @@ -1,5 +1,5 @@ -import matplotlib.pyplot as plt # noqa F401 -from learning.modules.critic import Critic # noqa F401 +import matplotlib.pyplot as plt +from learning.modules.critic import Critic from learning.utils import ( compute_generalized_advantages, @@ -12,7 +12,6 @@ import torch -# DEVICE = "cuda:0" DEVICE = "cpu" # * Setup @@ -34,7 +33,7 @@ # * Params gamma = 0.95 lam = 1.0 -num_steps = 10 # ! 
want this at 1 +visualize_steps = 10 # just to show rollouts n_trajs = 64 rand_perm = torch.randperm(n_envs) traj_idx = rand_perm[0:n_trajs] @@ -84,7 +83,7 @@ graphing_data["returns"], title=f"iteration{it}", fn=plot_dir + f"/PPO_CRITIC_it{it}", - data=graphing_obs[:num_steps, traj_idx], + data=graphing_obs[:visualize_steps, traj_idx], ) plt.close() diff --git a/scripts/visualize_sac.py b/scripts/visualize_sac.py index a0ac6296..d24b0d3a 100644 --- a/scripts/visualize_sac.py +++ b/scripts/visualize_sac.py @@ -1,5 +1,5 @@ -import matplotlib.pyplot as plt # noqa F401 -from learning.modules.critic import Critic # noqa F401 +import matplotlib.pyplot as plt +from learning.modules.critic import Critic from learning.utils import ( compute_generalized_advantages, @@ -12,7 +12,6 @@ import numpy as np import torch -# DEVICE = "cuda:0" DEVICE = "cpu" # * Setup