From e19ec214976a2d4d34b52ad8a8fc6e43c1f1a833 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 16 Feb 2024 13:30:46 -0500 Subject: [PATCH 01/98] add new PPO2, for now just split up actor and critic. Tested, 100% the same as PPO. --- learning/algorithms/__init__.py | 1 + learning/algorithms/ppo2.py | 232 ++++++++++++++++++++++++++++++++ learning/runners/BaseRunner.py | 2 +- 3 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 learning/algorithms/ppo2.py diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 7827379b..ace1b9fe 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -31,4 +31,5 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .ppo import PPO +from .ppo2 import PPO2 from .SE import StateEstimator diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py new file mode 100644 index 00000000..6649f396 --- /dev/null +++ b/learning/algorithms/ppo2.py @@ -0,0 +1,232 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# +# Copyright (c) 2021 ETH Zurich, Nikita Rudin + +import torch +import torch.nn as nn +import torch.optim as optim + +# from learning.modules import ActorCritic +from learning.storage import RolloutStorage + + +class PPO2: + # actor_critic: ActorCritic + + def __init__( + self, + actor_critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + value_loss_coef=1.0, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + # todo keeping actor_critic for loading code in runner + self.actor_critic = actor_critic.to(self.device) + self.actor = actor_critic.actor.to(self.device) + self.critic = actor_critic.critic.to(self.device) + self.storage = None # initialized later + parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.optimizer = optim.Adam(parameters, lr=learning_rate) + self.transition = RolloutStorage.Transition() + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def init_storage( + self, + num_envs, + num_transitions_per_env, + actor_obs_shape, + critic_obs_shape, + action_shape, + ): + self.storage = RolloutStorage( + num_envs, + num_transitions_per_env, + actor_obs_shape, + critic_obs_shape, + action_shape, + self.device, + ) + + def test_mode(self): + self.actor.test() + self.critic.test() + + def train_mode(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + # * Compute the actions and values + self.transition.actions = self.actor.act(obs).detach() + self.transition.values = self.critic.evaluate(critic_obs).detach() + self.transition.actions_log_prob = self.actor.get_actions_log_prob( + self.transition.actions + ).detach() + self.transition.action_mean = self.actor.action_mean.detach() + self.transition.action_sigma = self.actor.action_std.detach() + # * need to record obs and critic_obs before env.step() + self.transition.observations = obs + self.transition.critic_observations = critic_obs + return self.transition.actions + + def process_env_step(self, rewards, dones, timed_out=None): + self.transition.rewards = rewards.clone() + self.transition.dones = dones + # * Bootstrapping on time outs + if timed_out is not None: + self.transition.rewards += self.gamma * self.transition.values * timed_out + + # * Record the transition + self.storage.add_transitions(self.transition) + self.transition.clear() + + def compute_returns(self, last_critic_obs): + last_values = self.critic.evaluate(last_critic_obs).detach() + self.storage.compute_returns(last_values, self.gamma, self.lam) + + def update(self): + self.mean_value_loss = 0 + self.mean_surrogate_loss = 0 + generator = self.storage.mini_batch_generator( + self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + critic_obs_batch, + actions_batch, + target_values_batch, + advantages_batch, + returns_batch, + old_actions_log_prob_batch, + old_mu_batch, + old_sigma_batch, + ) in generator: + self.actor.act(obs_batch) + actions_log_prob_batch = 
self.actor.get_actions_log_prob(actions_batch) + value_batch = self.critic.evaluate(critic_obs_batch) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / old_sigma_batch + 1.0e-5) + + ( + torch.square(old_sigma_batch) + + torch.square(old_mu_batch - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch - torch.squeeze(old_actions_log_prob_batch) + ) + surrogate = -torch.squeeze(advantages_batch) * ratio + surrogate_clipped = -torch.squeeze(advantages_batch) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + # * Value function loss + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + loss = ( + surrogate_loss + + self.value_loss_coef * value_loss + - self.entropy_coef * entropy_batch.mean() + ) + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_( + list(self.actor.parameters()) + list(self.critic.parameters()), + self.max_grad_norm, + ) + self.optimizer.step() + + self.mean_value_loss += value_loss.item() + self.mean_surrogate_loss += surrogate_loss.item() + + num_updates = self.num_learning_epochs * self.num_mini_batches + self.mean_value_loss /= num_updates + self.mean_surrogate_loss /= num_updates + self.storage.clear() diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 627ba58f..5c1cf798 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,5 +1,5 @@ import torch -from learning.algorithms import PPO +from learning.algorithms import * # noqa from learning.modules import ActorCritic from learning.env import VecEnv from learning.utils import remove_zero_weighted_rewards From 1a7977e5e5b3fcb67eb8f620e5d460d6af0fc848 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 16 Feb 2024 17:33:04 -0500 Subject: [PATCH 02/98] PPO2: split optimizers for critic and actor --- gym/envs/mini_cheetah/mini_cheetah_config.py | 2 +- .../mini_cheetah/mini_cheetah_ref_config.py | 5 +- learning/algorithms/ppo.py | 22 +++++- learning/algorithms/ppo2.py | 74 +++++++++++++++---- learning/storage/rollout_storage.py | 2 +- 5 files changed, 82 insertions(+), 23 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..502dc571 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -196,7 +196,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): learning_rate = 1.0e-5 schedule = "adaptive" # can be adaptive or 
fixed discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 2.0 # [s] + # GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..867a61db 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -145,7 +145,8 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): learning_rate = 5.0e-5 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 1.0 # [s] + lam = 0.95 + # GAE_bootstrap_horizon = 1.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 @@ -153,5 +154,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..f492ef0e 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -69,7 +69,12 @@ def __init__( self.actor_critic = actor_critic self.actor_critic.to(self.device) self.storage = None # initialized later - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate) + self.optimizer = optim.Adam( + self.actor_critic.actor.parameters(), lr=learning_rate + ) + self.crit_optimizer = optim.Adam( + self.actor_critic.critic.parameters(), lr=learning_rate + ) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -156,7 +161,6 @@ def update(self): actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) - value_batch = self.actor_critic.evaluate(critic_obs_batch) mu_batch = self.actor_critic.action_mean sigma_batch = self.actor_critic.action_std entropy_batch = self.actor_critic.entropy @@ -195,6 +199,7 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss + value_batch = self.actor_critic.evaluate(critic_obs_batch) if self.use_clipped_value_loss: value_clipped = target_values_batch + ( value_batch - target_values_batch @@ -207,16 +212,25 @@ def update(self): loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.max_grad_norm + ) self.optimizer.step() + self.crit_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_( + self.actor_critic.critic.parameters(), self.max_grad_norm + ) + self.crit_optimizer.step() + self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 6649f396..06e7fabd 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -71,8 +71,9 @@ def __init__( self.actor = actor_critic.actor.to(self.device) self.critic = actor_critic.critic.to(self.device) self.storage = None # initialized later - parameters = list(self.actor.parameters()) + list(self.critic.parameters()) - self.optimizer = optim.Adam(parameters, lr=learning_rate) + # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_optimizer = 
optim.Adam(self.critic.parameters(), lr=learning_rate) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -143,6 +144,9 @@ def compute_returns(self, last_critic_obs): def update(self): self.mean_value_loss = 0 self.mean_surrogate_loss = 0 + + self.update_critic() + generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) @@ -159,7 +163,7 @@ def update(self): ) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) - value_batch = self.critic.evaluate(critic_obs_batch) + # value_batch = self.critic.evaluate(critic_obs_batch) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -198,19 +202,19 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() + # if self.use_clipped_value_loss: + # value_clipped = target_values_batch + ( + # value_batch - target_values_batch + # ).clamp(-self.clip_param, self.clip_param) + # value_losses = (value_batch - returns_batch).pow(2) + # value_losses_clipped = (value_clipped - returns_batch).pow(2) + # value_loss = torch.max(value_losses, value_losses_clipped).mean() + # else: + # value_loss = (returns_batch - value_batch).pow(2).mean() loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) @@ -223,10 +227,50 @@ def update(self): ) self.optimizer.step() - self.mean_value_loss += value_loss.item() + # self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() num_updates = self.num_learning_epochs * self.num_mini_batches - self.mean_value_loss /= num_updates + # self.mean_value_loss /= num_updates self.mean_surrogate_loss /= num_updates self.storage.clear() + + def update_critic(self): + self.mean_value_loss = 0 + counter = 0 + generator = self.storage.mini_batch_generator( + self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + critic_obs_batch, + actions_batch, + target_values_batch, + advantages_batch, + returns_batch, + old_actions_log_prob_batch, + old_mu_batch, + old_sigma_batch, + ) in generator: + value_batch = self.critic.evaluate(critic_obs_batch) + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + # * Gradient Step + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_policy(self): + pass diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index bba48e02..38a315d7 100644 --- 
a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -182,7 +182,7 @@ def get_statistics(self): trajectory_lengths = done_indices[1:] - done_indices[:-1] return trajectory_lengths.float().mean(), self.rewards.mean() - def mini_batch_generator(self, num_mini_batches, num_epochs=8): + def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env mini_batch_size = batch_size // num_mini_batches indices = torch.randperm( From 3253a659867370c0ce86889677d02485cacde97d Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 09:33:37 -0500 Subject: [PATCH 03/98] cleaning --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +- learning/algorithms/ppo2.py | 115 +++++++----------- learning/runners/BaseRunner.py | 4 +- 3 files changed, 51 insertions(+), 72 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 867a61db..c59f8a56 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -135,7 +135,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params - value_loss_coef = 1.0 + value_loss_coef = 1.0 # deprecate for PPO2 use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 @@ -146,7 +146,7 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 - # GAE_bootstrap_horizon = 1.0 # [s] + GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 06e7fabd..64857280 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -49,7 +49,6 @@ def __init__( clip_param=0.2, gamma=0.998, lam=0.95, - value_loss_coef=1.0, entropy_coef=0.0, learning_rate=1e-3, max_grad_norm=1.0, @@ -80,7 +79,6 @@ def __init__( self.clip_param = clip_param self.num_learning_epochs = num_learning_epochs self.num_mini_batches = num_mini_batches - self.value_loss_coef = value_loss_coef self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -146,24 +144,65 @@ def update(self): self.mean_surrogate_loss = 0 self.update_critic() + self.update_actor() + self.storage.clear() + def update_critic(self): + self.mean_value_loss = 0 + counter = 0 generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) for ( - obs_batch, + _, critic_obs_batch, - actions_batch, + _, target_values_batch, - advantages_batch, + _, returns_batch, + _, + _, + _, + ) in generator: + value_batch = self.critic.evaluate(critic_obs_batch) + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + # * Gradient Step + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self): + self.mean_surrogate_loss = 0 + counter = 0 + generator = self.storage.mini_batch_generator( + 
self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + _, + actions_batch, + _, + advantages_batch, + _, old_actions_log_prob_batch, old_mu_batch, old_sigma_batch, ) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) - # value_batch = self.critic.evaluate(critic_obs_batch) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -201,22 +240,7 @@ def update(self): ) surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() - # * Value function loss - # if self.use_clipped_value_loss: - # value_clipped = target_values_batch + ( - # value_batch - target_values_batch - # ).clamp(-self.clip_param, self.clip_param) - # value_losses = (value_batch - returns_batch).pow(2) - # value_losses_clipped = (value_clipped - returns_batch).pow(2) - # value_loss = torch.max(value_losses, value_losses_clipped).mean() - # else: - # value_loss = (returns_batch - value_batch).pow(2).mean() - - loss = ( - surrogate_loss - # + self.value_loss_coef * value_loss - - self.entropy_coef * entropy_batch.mean() - ) + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() # * Gradient step self.optimizer.zero_grad() @@ -226,51 +250,6 @@ def update(self): self.max_grad_norm, ) self.optimizer.step() - - # self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() - - num_updates = self.num_learning_epochs * self.num_mini_batches - # self.mean_value_loss /= num_updates - self.mean_surrogate_loss /= num_updates - self.storage.clear() - - def update_critic(self): - self.mean_value_loss = 0 - counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - for ( - obs_batch, - critic_obs_batch, - actions_batch, - target_values_batch, - advantages_batch, - returns_batch, - old_actions_log_prob_batch, - old_mu_batch, - old_sigma_batch, - ) in generator: - value_batch = self.critic.evaluate(critic_obs_batch) - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() - - # * Gradient Step - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() counter += 1 - self.mean_value_loss /= counter - - def update_policy(self): - pass + self.mean_surrogate_loss /= counter diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 5c1cf798..2c8cda65 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,5 +1,5 @@ import torch -from learning.algorithms import * # noqa +from learning.algorithms import * # noqa: F403 from learning.modules import ActorCritic from learning.env import VecEnv from learning.utils import remove_zero_weighted_rewards @@ -19,7 +19,7 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): ).to(self.device) alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg: PPO = alg_class(actor_critic, device=self.device, **self.alg_cfg) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) self.num_steps_per_env = 
self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] From e3c258971a4db04c2f628b83ab52e945bcc7626e Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 14:17:25 -0500 Subject: [PATCH 04/98] Basic internals of dict_storage in place, fills correctly --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +- learning/algorithms/ppo2.py | 15 ++++--- learning/runners/my_runner.py | 28 +++++++++++- learning/storage/__init__.py | 1 + learning/storage/dict_storage.py | 45 +++++++++++++++++++ 5 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 learning/storage/dict_storage.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index c59f8a56..4ff6ebaa 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - + normalize_obs = False actor_obs = [ "base_ang_vel", "projected_gravity", @@ -142,6 +142,8 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 + storage_size = 2**17 # new + mini_batch_size = 2**15 # new learning_rate = 5.0e-5 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 64857280..118920e0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -153,13 +153,16 @@ def update_critic(self): generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) + + # update storage with values evaluated now + for ( _, critic_obs_batch, _, - target_values_batch, + target_values_batch, # ! eval _, - returns_batch, + returns_batch, # ! calc _, _, _, @@ -195,11 +198,11 @@ def update_actor(self): _, actions_batch, _, - advantages_batch, + advantages_batch, # calc + eval _, - old_actions_log_prob_batch, - old_mu_batch, - old_sigma_batch, + old_actions_log_prob_batch, # eval? + old_mu_batch, # eval? + old_sigma_batch, # eval? 
) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 74b842d4..f1f824b8 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -1,4 +1,5 @@ import torch +from tensordict import TensorDict from learning.env import VecEnv from learning.utils import Logger @@ -7,7 +8,10 @@ from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + logger = Logger() +storage = DictStorage() class MyRunner(OnPolicyRunner): @@ -34,9 +38,21 @@ def learn(self): actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - + rewards_dict self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize(transition) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -67,6 +83,16 @@ def learn(self): rewards_dict.update(PBRS.post_step(self.env, dones)) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "actor_obs": actor_obs, + "critic_obs": critic_obs, + "rewards": total_rewards, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index edd7bd55..f10df85f 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,3 +3,4 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage +from .dict_storage import DictStorage \ No newline at end of file diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py new file mode 100644 index 00000000..2b5c8e3b --- /dev/null +++ b/learning/storage/dict_storage.py @@ -0,0 +1,45 @@ +import torch +from tensordict import TensorDict + + +class DictStorage: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.storage = TensorDict({}, batch_size=(num_envs, max_length), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: + self.storage[key] = torch.zeros( + (self.num_envs, self.max_length), + device=self.device, + ) + else: + self.storage[key] = torch.zeros( + (self.num_envs, self.max_length, dummy_dict[key].shape[1]), + device=self.device, + ) + + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + raise AssertionError("Rollout buffer overflow") + self.storage[:, self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + + def mini_batch_generator(self, keys, mini_batch_size, num_epochs): + pass From 7c1166b9048e1dcd228596a6fa84a89dcee67eaf Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 18:57:19 -0500 Subject: [PATCH 05/98] WIP: change storage to be (n_steps, n_envs, var_dim), among other things --- 
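Note on the layout change (annotation below the "---" marker, so not applied by git am; names T, N, obs_dim, act_dim below are placeholder toy sizes, not identifiers from this repo): the buffer now indexes time first, i.e. (n_steps, n_envs, var_dim), so per-step writes and the backward return/advantage passes slice along the leading dimension, and minibatching just flattens the two leading batch dims. A minimal sketch of that fill-and-flatten pattern:

    import torch
    from tensordict import TensorDict

    T, N, obs_dim, act_dim = 8, 4, 6, 3  # assumed toy sizes

    data = TensorDict({}, batch_size=(T, N))
    data["actor_obs"] = torch.zeros(T, N, obs_dim)
    data["actions"] = torch.zeros(T, N, act_dim)
    data["rewards"] = torch.zeros(T, N)
    data["dones"] = torch.zeros(T, N, dtype=torch.bool)

    for t in range(T):
        # one environment step produces a batch over envs only
        step = TensorDict(
            {
                "actor_obs": torch.randn(N, obs_dim),
                "actions": torch.randn(N, act_dim),
                "rewards": torch.randn(N),
                "dones": torch.zeros(N, dtype=torch.bool),
            },
            batch_size=(N,),
        )
        data[t] = step  # time-major: step index first, then envs

    # flattening (n_steps, n_envs) into n_steps*n_envs samples is what the
    # uniform minibatch generator added later in this series relies on
    flat = data.flatten(0, 1)
    assert flat["actor_obs"].shape == (T * N, obs_dim)
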
learning/algorithms/ppo2.py | 32 ----------- learning/runners/my_runner.py | 11 +++- learning/storage/dict_storage.py | 14 ++--- learning/utils/__init__.py | 37 ++----------- learning/utils/utils.py | 93 -------------------------------- 5 files changed, 21 insertions(+), 166 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 118920e0..a5ef5ac4 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -1,35 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. 
-# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - import torch import torch.nn as nn import torch.optim as optim diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index f1f824b8..4a20880d 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,6 +5,7 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards +# from learning.utils import compute_MC_returns, compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -46,6 +47,7 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -86,19 +88,26 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "actions": actions, "critic_obs": critic_obs, "rewards": total_rewards, "dones": dones, } ) - storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) self.alg.process_env_step(total_rewards, dones, timed_out) + storage.add_transitions(transition) + self.alg.compute_returns(critic_obs) + # compute_MC_returns(storage.data, self.alg.gamma) + # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic.eval) + # compute_generalized_advantages( + # storage.data, self.alg.gamma, self.alg.lam, self.alg.critic.eval + # ) logger.toc("collection") logger.tic("learning") diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 2b5c8e3b..135684d3 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -17,25 +17,25 @@ def initialize( self.num_envs = num_envs max_length = max_storage // num_envs self.max_length = max_length - self.storage = TensorDict({}, batch_size=(num_envs, max_length), device=device) + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) self.fill_count = 0 for key in dummy_dict.keys(): - if dummy_dict[key].dim() == 1: - self.storage[key] = torch.zeros( - (self.num_envs, self.max_length), + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), device=self.device, ) else: - self.storage[key] = torch.zeros( - (self.num_envs, self.max_length, dummy_dict[key].shape[1]), + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), device=self.device, ) def add_transitions(self, transition: TensorDict): if self.fill_count >= self.max_length: raise AssertionError("Rollout buffer overflow") - self.storage[:, self.fill_count] = transition + self.data[self.fill_count] = transition self.fill_count += 1 def clear(self): diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index a39a3daf..8559714e 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,40 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin from .utils import ( - split_and_pad_trajectories, - unpad_trajectories, remove_zero_weighted_rewards, set_discount_from_horizon, ) +from .dict_utils import ( + compute_generalized_advantages, + compute_MC_returns +) from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/utils.py b/learning/utils/utils.py index b1a20bcf..7a54c65a 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -1,96 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. 
-# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - -import torch - - -def split_and_pad_trajectories(tensor, dones): - """ - Splits trajectories at done indices. Then concatenates them and pads - with zeros up to the length og the longest trajectory. - Returns masks corresponding to valid parts of the trajectories - Example: - Input: [ [a1, a2, a3, a4 | a5, a6], - [b1, b2 | b3, b4, b5 | b6] - ] - - Output:[ [a1, a2, a3, a4], | [ [True, True, True, True], - [a5, a6, 0, 0], | [True, True, False, False], - [b1, b2, 0, 0], | [True, True, False, False], - [b3, b4, b5, 0], | [True, True, True, False], - [b6, 0, 0, 0] | [True, False, False, False], - ] | ] - - Assumes that the inputy has the following dimension order: - [time, number of envs, aditional dimensions] - """ - dones = dones.clone() - dones[-1] = 1 - # * Permute the buffers to have order - # * (num_envs, num_transitions_per_env, ...), for correct reshaping - flat_dones = dones.transpose(1, 0).reshape(-1, 1) - - # * Get length of trajectory by counting the number of - # * successive not done elements - done_indices = torch.cat( - ( - flat_dones.new_tensor([-1], dtype=torch.int64), - flat_dones.nonzero()[:, 0], - ) - ) - trajectory_lengths = done_indices[1:] - done_indices[:-1] - trajectory_lengths_list = trajectory_lengths.tolist() - # * Extract the individual trajectories - trajectories = torch.split( - tensor.transpose(1, 0).flatten(0, 1), trajectory_lengths_list - ) - padded_trajectories = torch.nn.utils.rnn.pad_sequence(trajectories) - - trajectory_masks = trajectory_lengths > torch.arange( - 0, tensor.shape[0], device=tensor.device - ).unsqueeze(1) - return padded_trajectories, trajectory_masks - - -def unpad_trajectories(trajectories, masks): - """Does the inverse operation of split_and_pad_trajectories()""" - # * Need to transpose before and after the masking to have proper reshaping - return ( - trajectories.transpose(1, 0)[masks.transpose(1, 0)] - .view(-1, trajectories.shape[0], trajectories.shape[-1]) - .transpose(1, 0) - ) - - def remove_zero_weighted_rewards(reward_weights): for name in list(reward_weights.keys()): if reward_weights[name] == 0: From fa05a31b1efbbd341f1aa8e142485c334a7b601d Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 09:39:33 -0500 Subject: [PATCH 06/98] implemented compute advantages (returns), validated against existing. Same up to 1e-5 on individual values. Because of differencing, this can end up in much larger differences (1e0) occasionally, after normalization. 
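For reference, a minimal standalone check of the equivalence being validated here (illustrative only: toy shapes, no dones or time-outs, and no normalization, all of which the in-tree compute_generalized_advantages additionally handles). The backward recursion over TD errors should match the explicit discounted sum term by term:

    import torch

    T, N = 16, 3
    gamma, lam = 0.99, 0.95
    rewards = torch.randn(T, N)
    values = torch.randn(T + 1, N)          # V(s_0..s_T); last entry bootstraps
    dones = torch.zeros(T, N, dtype=torch.bool)
    not_done = (~dones).float()

    # TD errors: delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
    deltas = rewards + gamma * values[1:] * not_done - values[:-1]

    # backward recursion: A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    adv_rec = torch.zeros(T, N)
    adv_rec[-1] = deltas[-1]
    for t in reversed(range(T - 1)):
        adv_rec[t] = deltas[t] + gamma * lam * not_done[t] * adv_rec[t + 1]

    # explicit form: A_t = sum_{l>=0} (gamma*lam)^l * delta_{t+l}
    # (dones are all False here, so the plain sum applies)
    adv_sum = torch.zeros(T, N)
    for t in range(T):
        discount = 1.0
        for k in range(t, T):
            adv_sum[t] += discount * deltas[k]
            discount *= gamma * lam

    assert torch.allclose(adv_rec, adv_sum, atol=1e-5)
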
--- learning/runners/my_runner.py | 29 +++++++++++++----- learning/storage/dict_storage.py | 2 ++ learning/utils/dict_utils.py | 52 ++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 learning/utils/dict_utils.py diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 4a20880d..08721b5e 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,7 +5,7 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards -# from learning.utils import compute_MC_returns, compute_generalized_advantages +from learning.utils import compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -53,7 +53,7 @@ def learn(self): "dones": self.get_timed_out(), } ) - storage.initialize(transition) + storage.initialize(transition, device=self.device) logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): @@ -69,6 +69,14 @@ def learn(self): self.policy_cfg["disable_actions"], ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + PBRS.pre_step(self.env) self.env.step() @@ -87,9 +95,6 @@ def learn(self): transition.update( { - "actor_obs": actor_obs, - "actions": actions, - "critic_obs": critic_obs, "rewards": total_rewards, "dones": dones, } @@ -104,10 +109,20 @@ def learn(self): self.alg.compute_returns(critic_obs) # compute_MC_returns(storage.data, self.alg.gamma) - # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic.eval) + # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic) # compute_generalized_advantages( - # storage.data, self.alg.gamma, self.alg.lam, self.alg.critic.eval + # storage.data, + # self.alg.gamma, + # self.alg.lam, + # self.alg.critic, # ) + compute_generalized_advantages( + storage.data, + self.alg.gamma, + self.alg.lam, + self.alg.critic, + self.alg.critic.evaluate(critic_obs), + ) logger.toc("collection") logger.tic("learning") diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 135684d3..3155ca28 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -24,11 +24,13 @@ def initialize( if dummy_dict[key].dim() == 1: # if scalar self.data[key] = torch.zeros( (max_length, num_envs), + dtype=dummy_dict[key].dtype, device=self.device, ) else: self.data[key] = torch.zeros( (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, device=self.device, ) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py new file mode 100644 index 00000000..c1b3cb4f --- /dev/null +++ b/learning/utils/dict_utils.py @@ -0,0 +1,52 @@ +import torch +from tensordict import TensorDict + + +def compute_MC_returns(data: TensorDict, gamma, critic=None): + if critic is None: + last_values = torch.zeros_like(data["rewards"][0]) + else: + last_values = critic.evaluate(data["critic_obs"][-1]) + + data.update({"returns": torch.zeros_like(data["rewards"])}) + data["returns"][-1] = last_values * ~data["dones"][-1] + for k in reversed(range(data["rewards"].shape[0] - 1)): + not_done = ~data["dones"][k] + data["returns"][k] = ( + data["rewards"][k] + gamma * data["returns"][k + 1] * not_done + ) + return + + +def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + if "values" not in data.keys(): + data.update({"values": critic.evaluate(data["critic_obs"])}) + + if "advantages" not in 
data.keys(): + data.update({"advantages": torch.zeros_like(data["values"])}) + else: + data["advantages"].zero_() + + if last_values is not None: + # todo check this + # since we don't have observations for the last step, need last value plugged in + data["advantages"][-1] = ( + data["rewards"][-1] + + gamma * last_values * ~data["dones"][-1] + - data["values"][-1] + ) + + for k in reversed(range(data["values"].shape[0] - 1)): + not_done = ~data["dones"][k] + td_error = ( + data["rewards"][k] + + gamma * data["values"][k + 1] * not_done + - data["values"][k] + ) + data["advantages"][k] = ( + td_error + gamma * lam * not_done * data["advantages"][k + 1] + ) + # data["returns"] = data["advantages"] + data["values"] + data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( + data["advantages"].std() + 1e-8 + ) From 5c5d38b2a5e79f26c4776a6806b5d4d8abb99ff5 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 13:52:55 -0500 Subject: [PATCH 07/98] critic update runs, but is very slow. Not clear why --- learning/algorithms/ppo2.py | 29 +++++++++++++++++++++++++++++ learning/runners/my_runner.py | 3 +++ learning/storage/dict_storage.py | 4 +--- learning/utils/__init__.py | 5 +---- learning/utils/dict_utils.py | 17 +++++++++++++++++ 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index a5ef5ac4..1a6e475f 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -4,6 +4,7 @@ # from learning.modules import ActorCritic from learning.storage import RolloutStorage +from learning.utils import create_uniform_generator, compute_MC_returns class PPO2: @@ -115,6 +116,34 @@ def update(self): self.update_actor() self.storage.clear() + def update_critic2(self, data): + self.mean_value_loss = 0 + counter = 0 + compute_MC_returns(data, self.gamma, self.critic) + generator = create_uniform_generator(data, 500, self.num_learning_epochs) + # with torch.inference_mode(): + # target_values = self.critic.evaluate(data["critic_obs"]).detach() + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + if self.use_clipped_value_loss: + target_value_batch = batch["values"] + value_clipped = target_value_batch + ( + value_batch - target_value_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - target_value_batch).pow(2) + value_losses_clipped = (value_clipped - batch["returns"]).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (batch["returns"] - value_batch).pow(2).mean() + + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + def update_critic(self): self.mean_value_loss = 0 counter = 0 diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 08721b5e..7277dda6 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -105,6 +105,7 @@ def learn(self): logger.finish_step(dones) self.alg.process_env_step(total_rewards, dones, timed_out) + # doesn't need detach becuase we have no_inference storage.add_transitions(transition) self.alg.compute_returns(critic_obs) @@ -126,6 +127,7 @@ def learn(self): logger.toc("collection") logger.tic("learning") + self.alg.update_critic2(storage.data) self.alg.update() logger.toc("learning") logger.log_category() @@ 
-137,6 +139,7 @@ def learn(self): if self.it % self.save_interval == 0: self.save() + storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 3155ca28..8354b179 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -34,6 +34,7 @@ def initialize( device=self.device, ) + @torch.inference_mode def add_transitions(self, transition: TensorDict): if self.fill_count >= self.max_length: raise AssertionError("Rollout buffer overflow") @@ -42,6 +43,3 @@ def add_transitions(self, transition: TensorDict): def clear(self): self.fill_count = 0 - - def mini_batch_generator(self, keys, mini_batch_size, num_epochs): - pass diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 8559714e..15c674d6 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -3,9 +3,6 @@ remove_zero_weighted_rewards, set_discount_from_horizon, ) -from .dict_utils import ( - compute_generalized_advantages, - compute_MC_returns -) +from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index c1b3cb4f..1480c48e 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -2,6 +2,7 @@ from tensordict import TensorDict +@torch.no_grad def compute_MC_returns(data: TensorDict, gamma, critic=None): if critic is None: last_values = torch.zeros_like(data["rewards"][0]) @@ -18,6 +19,7 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): return +@torch.no_grad def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): if "values" not in data.keys(): data.update({"values": critic.evaluate(data["critic_obs"])}) @@ -50,3 +52,18 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( data["advantages"].std() + 1e-8 ) + + +# todo change num_epochs to num_batches +@torch.no_grad +def create_uniform_generator(data, batch_size, num_epochs, keys=None): + n, m = data.shape + total_data = n * m + num_batches_per_epoch = total_data // batch_size + for epoch in range(num_epochs): + indices = torch.randperm(total_data, device=data.device) + for i in range(num_batches_per_epoch): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data From afa0be251745f719f7114792a8f5140674dd8369 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 20:13:14 -0500 Subject: [PATCH 08/98] Learning critic with dict storage now working. Left without boot-strapping on last step, seems like it makes no difference. Boot-strapping on time-outs made a big difference though. 
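Why the time-out bootstrap matters, as a toy example (illustrative only; the names and numbers below are assumed, not taken from the repo). A time-out truncates an episode without the state being a failure, so the TD target keeps a discounted value term; following the convention used in this series, V(s_t) stands in for the unavailable post-reset V(s_{t+1}):

    import torch

    gamma = 0.99
    r = torch.tensor([1.0, 1.0, 1.0])
    v = torch.tensor([10.0, 10.0, 10.0])            # V(s_t)
    v_next = torch.tensor([10.0, 10.0, 0.0])        # V(s_{t+1}); last entry is post-reset
    dones = torch.tensor([False, False, True])      # episode cut at t = 2 ...
    timed_out = torch.tensor([False, False, True])  # ... but only by the time limit

    not_done = (~dones).float()
    # without time-out bootstrapping: the cut step's target collapses to r_t - V_t
    td_plain = r + gamma * v_next * not_done - v
    # with time-out bootstrapping: the cut step still carries a discounted value term
    td_boot = r + gamma * v * timed_out.float() + gamma * v_next * not_done - v
    print(td_plain[-1].item(), td_boot[-1].item())  # -9.0 vs ~0.9

Without that term the critic is trained as if every time-limit cut were a true terminal state, which systematically underestimates values near the horizon.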
--- learning/algorithms/ppo2.py | 29 +++++++++++++++++++++-------- learning/runners/my_runner.py | 17 +++++++++-------- learning/utils/dict_utils.py | 20 ++++++++++++-------- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 1a6e475f..d4ecca81 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -4,7 +4,10 @@ # from learning.modules import ActorCritic from learning.storage import RolloutStorage -from learning.utils import create_uniform_generator, compute_MC_returns +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) class PPO2: @@ -109,18 +112,28 @@ def compute_returns(self, last_critic_obs): self.storage.compute_returns(last_values, self.gamma, self.lam) def update(self): - self.mean_value_loss = 0 - self.mean_surrogate_loss = 0 - - self.update_critic() + # self.update_critic() self.update_actor() self.storage.clear() - def update_critic2(self, data): + def update_critic2(self, data, last_obs=None): self.mean_value_loss = 0 counter = 0 - compute_MC_returns(data, self.gamma, self.critic) - generator = create_uniform_generator(data, 500, self.num_learning_epochs) + + if last_obs is not None: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + else: + last_values = None + # compute_MC_returns(data, self.gamma, self.critic) + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + # ! temp hack + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) # with torch.inference_mode(): # target_values = self.critic.evaluate(data["critic_obs"]).detach() for batch in generator: diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 7277dda6..9b3f1939 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,7 +5,6 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards -from learning.utils import compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -96,6 +95,7 @@ def learn(self): transition.update( { "rewards": total_rewards, + "timed_out": timed_out, "dones": dones, } ) @@ -117,16 +117,17 @@ def learn(self): # self.alg.lam, # self.alg.critic, # ) - compute_generalized_advantages( - storage.data, - self.alg.gamma, - self.alg.lam, - self.alg.critic, - self.alg.critic.evaluate(critic_obs), - ) + # compute_generalized_advantages( + # storage.data, + # self.alg.gamma, + # self.alg.lam, + # self.alg.critic, + # self.alg.critic.evaluate(critic_obs), + # ) logger.toc("collection") logger.tic("learning") + # self.alg.update_critic2(storage.data, last_obs=critic_obs) self.alg.update_critic2(storage.data) self.alg.update() logger.toc("learning") diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 1480c48e..2d19e072 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -16,25 +16,26 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): data["returns"][k] = ( data["rewards"][k] + gamma * data["returns"][k + 1] * not_done ) + data["returns"] = (data["returns"] - data["returns"].mean()) / ( + data["returns"].std() + 1e-8 + ) return @torch.no_grad def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - if "values" not in 
data.keys(): - data.update({"values": critic.evaluate(data["critic_obs"])}) + data.update({"values": critic.evaluate(data["critic_obs"])}) - if "advantages" not in data.keys(): - data.update({"advantages": torch.zeros_like(data["values"])}) - else: - data["advantages"].zero_() + data.update({"advantages": torch.zeros_like(data["values"])}) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in + not_done = ~data["dones"][-1] data["advantages"][-1] = ( data["rewards"][-1] - + gamma * last_values * ~data["dones"][-1] + + gamma * data["values"][-1] * data["timed_out"][-1] + + gamma * last_values * not_done - data["values"][-1] ) @@ -42,13 +43,16 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): not_done = ~data["dones"][k] td_error = ( data["rewards"][k] + + gamma * data["values"][k] * data["timed_out"][k] + gamma * data["values"][k + 1] * not_done - data["values"][k] ) data["advantages"][k] = ( td_error + gamma * lam * not_done * data["advantages"][k + 1] ) - # data["returns"] = data["advantages"] + data["values"] + + data["returns"] = data["advantages"] + data["values"] + data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( data["advantages"].std() + 1e-8 ) From 12661f0eb85e12b22587b145f53f2f2f625e6b79 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 25 Feb 2024 15:29:20 -0500 Subject: [PATCH 09/98] add tensordict to reqs --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d08d918a..10e4b8c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ mss ruff pre-commit onnx -onnxruntime \ No newline at end of file +onnxruntime +tensordict \ No newline at end of file From 679a729b330c753e0b8083fb093dd1bd87a485a0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:10:36 -0500 Subject: [PATCH 10/98] WIP: works now, but not with normalization turned on --- gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 +- .../mini_cheetah/mini_cheetah_ref_config.py | 7 - learning/algorithms/ppo.py | 5 + learning/algorithms/ppo2.py | 168 +++++------------- learning/modules/critic.py | 3 +- learning/modules/utils/normalize.py | 57 ++---- learning/runners/BaseRunner.py | 3 - learning/runners/__init__.py | 3 +- learning/runners/my_runner.py | 51 +----- learning/runners/on_policy_runner.py | 54 ++++-- learning/storage/dict_storage.py | 3 + 13 files changed, 113 insertions(+), 248 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a59cc9eb..4d039f74 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -182,7 +182,7 @@ class algorithm: class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..2955f5a5 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -300,7 +300,7 @@ class algorithm: class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1500 save_interval = 50 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py 
b/gym/envs/mini_cheetah/mini_cheetah_config.py index 502dc571..689fcd64 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -124,6 +124,7 @@ class scaling(LeggedRobotCfg.scaling): class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] @@ -204,5 +205,5 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" max_iterations = 500 - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4ff6ebaa..f7699f32 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -123,13 +123,6 @@ class weights: swing_grf = 1.5 stance_grf = 1.5 - class pbrs_weights: - reference_traj = 0.0 - swing_grf = 0.0 - stance_grf = 0.0 - min_base_height = 0.0 - orientation = 0.0 - class termination_weight: termination = 0.15 diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index f492ef0e..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -126,6 +126,11 @@ def act(self, obs, critic_obs): return self.transition.actions def process_env_step(self, rewards, dones, timed_out=None): + if self.storage is None: + raise AttributeError( + "This version of PPO is deprecated and only works with OldPolicyRunner." + "Use PPO2 instead." + ) self.transition.rewards = rewards.clone() self.transition.dones = dones # * Bootstrapping on time outs diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index d4ecca81..ae5226d2 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -2,8 +2,6 @@ import torch.nn as nn import torch.optim as optim -# from learning.modules import ActorCritic -from learning.storage import RolloutStorage from learning.utils import ( create_uniform_generator, compute_generalized_advantages, @@ -45,7 +43,6 @@ def __init__( # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) - self.transition = RolloutStorage.Transition() # * PPO parameters self.clip_param = clip_param @@ -57,23 +54,6 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def init_storage( - self, - num_envs, - num_transitions_per_env, - actor_obs_shape, - critic_obs_shape, - action_shape, - ): - self.storage = RolloutStorage( - num_envs, - num_transitions_per_env, - actor_obs_shape, - critic_obs_shape, - action_shape, - self.device, - ) - def test_mode(self): self.actor.test() self.critic.test() @@ -83,61 +63,33 @@ def train_mode(self): self.critic.train() def act(self, obs, critic_obs): - # * Compute the actions and values - self.transition.actions = self.actor.act(obs).detach() - self.transition.values = self.critic.evaluate(critic_obs).detach() - self.transition.actions_log_prob = self.actor.get_actions_log_prob( - self.transition.actions - ).detach() - self.transition.action_mean = self.actor.action_mean.detach() - self.transition.action_sigma = self.actor.action_std.detach() - # * need to record obs and critic_obs before env.step() - self.transition.observations = obs - self.transition.critic_observations = 
critic_obs - return self.transition.actions - - def process_env_step(self, rewards, dones, timed_out=None): - self.transition.rewards = rewards.clone() - self.transition.dones = dones - # * Bootstrapping on time outs - if timed_out is not None: - self.transition.rewards += self.gamma * self.transition.values * timed_out - - # * Record the transition - self.storage.add_transitions(self.transition) - self.transition.clear() - - def compute_returns(self, last_critic_obs): - last_values = self.critic.evaluate(last_critic_obs).detach() - self.storage.compute_returns(last_values, self.gamma, self.lam) - - def update(self): - # self.update_critic() - self.update_actor() - self.storage.clear() - - def update_critic2(self, data, last_obs=None): - self.mean_value_loss = 0 - counter = 0 + return self.actor.act(obs).detach() - if last_obs is not None: + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: with torch.no_grad(): last_values = self.critic.evaluate(last_obs).detach() - else: - last_values = None - # compute_MC_returns(data, self.gamma, self.critic) compute_generalized_advantages( data, self.gamma, self.lam, self.critic, last_values ) - # ! temp hack + + self.update_critic(data) + self.update_actor(data) + + def update_critic(self, data, last_obs=None): + self.mean_value_loss = 0 + counter = 0 + n, m = data.shape total_data = n * m batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) - # with torch.inference_mode(): - # target_values = self.critic.evaluate(data["critic_obs"]).detach() for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) + with torch.inference_mode(): + value_batch = self.critic.evaluate(batch["critic_obs"]) + if self.use_clipped_value_loss: target_value_batch = batch["values"] value_clipped = target_value_batch + ( @@ -157,65 +109,27 @@ def update_critic2(self, data, last_obs=None): counter += 1 self.mean_value_loss /= counter - def update_critic(self): - self.mean_value_loss = 0 + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - - # update storage with values evaluated now - - for ( - _, - critic_obs_batch, - _, - target_values_batch, # ! eval - _, - returns_batch, # ! 
calc - _, - _, - _, - ) in generator: - value_batch = self.critic.evaluate(critic_obs_batch) - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() - # * Gradient Step - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() - counter += 1 - self.mean_value_loss /= counter + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() - def update_actor(self): - self.mean_surrogate_loss = 0 - counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - for ( - obs_batch, - _, - actions_batch, - _, - advantages_batch, # calc + eval - _, - old_actions_log_prob_batch, # eval? - old_mu_batch, # eval? - old_sigma_batch, # eval? - ) in generator: - self.actor.act(obs_batch) - actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -224,10 +138,10 @@ def update_actor(self): if self.desired_kl is not None and self.schedule == "adaptive": with torch.inference_mode(): kl = torch.sum( - torch.log(sigma_batch / old_sigma_batch + 1.0e-5) + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + ( - torch.square(old_sigma_batch) - + torch.square(old_mu_batch - mu_batch) + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) ) / (2.0 * torch.square(sigma_batch)) - 0.5, @@ -241,14 +155,16 @@ def update_actor(self): self.learning_rate = min(1e-2, self.learning_rate * 1.5) for param_group in self.optimizer.param_groups: + # ! 
check this param_group["lr"] = self.learning_rate # * Surrogate loss ratio = torch.exp( - actions_log_prob_batch - torch.squeeze(old_actions_log_prob_batch) + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) ) - surrogate = -torch.squeeze(advantages_batch) * ratio - surrogate_clipped = -torch.squeeze(advantages_batch) * torch.clamp( + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( ratio, 1.0 - self.clip_param, 1.0 + self.clip_param ) surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..9b11f3ee 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -31,5 +31,6 @@ def evaluate(self, critic_observations): return self.NN(critic_observations).squeeze() def normalize(self, observation): - with torch.no_grad(): + # with torch.no_grad(): + with torch.inference_mode(): return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 5623b80c..a8e52fd2 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -2,22 +2,6 @@ import torch.nn as nn -def get_mean_var_with_masks(values, masks): - """ - Applies a mask to the input values and calculates - the mean and variance over the valid entries, as specified - by the mask. - """ - sum_mask = masks.sum() - masked_vals = values * masks - values_mean = masked_vals.sum() / sum_mask - min_sqr = (((masked_vals) ** 2) / sum_mask).sum() - ( - (masked_vals / sum_mask).sum() - ) ** 2 - values_var = min_sqr * sum_mask / (sum_mask - 1) - return values_mean, values_var - - class RunningMeanStd(nn.Module): """ Keeps a running mean to normalize tensor of choice. @@ -44,11 +28,6 @@ def _update_mean_var_from_moments( batch_var, batch_count, ): - """ - Implements parallel algorithm for combining arbitrary sets A and B - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - #Parallel_algorithm - """ tot_count = running_count + batch_count delta = batch_mean - running_mean @@ -64,28 +43,22 @@ def _update_mean_var_from_moments( new_var = M2 / (tot_count - 1) return new_mean, new_var, tot_count - def forward(self, input, mask=None): - """ - Returns the normalized version of the input. 
- """ + def forward(self, input): if self.training: - if mask is not None: - mean, var = get_mean_var_with_masks(input, mask) - else: - mean = input.mean(self.axis) - var = input.var(self.axis) - ( - self.running_mean, - self.running_var, - self.count, - ) = self._update_mean_var_from_moments( - self.running_mean, - self.running_var, - self.count, - mean, - var, - input.size()[0], - ) + mean = input.mean(self.axis) + var = input.var(self.axis) + ( + self.running_mean, + self.running_var, + self.count, + ) = self._update_mean_var_from_moments( + self.running_mean, + self.running_var, + self.count, + mean, + var, + input.size()[0], + ) current_mean = self.running_mean current_var = self.running_var diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 2c8cda65..53e88835 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -26,9 +26,6 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 - - # * init storage and model - self.init_storage() self.log_dir = train_cfg["log_dir"] def parse_train_cfg(self, train_cfg): diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index a24a0c9c..b0f49217 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -31,4 +31,5 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .on_policy_runner import OnPolicyRunner -from .my_runner import MyRunner \ No newline at end of file +from .my_runner import MyRunner +from .old_policy_runner import OldPolicyRunner \ No newline at end of file diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 9b3f1939..e25ca265 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -3,11 +3,8 @@ from learning.env import VecEnv from learning.utils import Logger -from learning.utils import PotentialBasedRewardShaping -from learning.utils import remove_zero_weighted_rewards from .on_policy_runner import OnPolicyRunner - from learning.storage import DictStorage logger = Logger() @@ -26,11 +23,6 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): def learn(self): self.set_up_logger() - remove_zero_weighted_rewards(self.policy_cfg["reward"]["pbrs_weights"]) - PBRS = PotentialBasedRewardShaping( - self.policy_cfg["reward"]["pbrs_weights"], self.device - ) - logger.register_rewards(PBRS.get_reward_keys()) rewards_dict = {} @@ -76,7 +68,6 @@ def learn(self): } ) - PBRS.pre_step(self.env) self.env.step() actor_obs = self.get_noisy_obs( @@ -89,7 +80,6 @@ def learn(self): dones = timed_out | terminated self.update_rewards(rewards_dict, terminated) - rewards_dict.update(PBRS.post_step(self.env, dones)) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) transition.update( @@ -99,37 +89,16 @@ def learn(self): "dones": dones, } ) + storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - # doesn't need detach becuase we have no_inference - storage.add_transitions(transition) - - self.alg.compute_returns(critic_obs) - # compute_MC_returns(storage.data, self.alg.gamma) - # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic) - # compute_generalized_advantages( - # storage.data, - # self.alg.gamma, - # self.alg.lam, - # self.alg.critic, - # ) - # compute_generalized_advantages( - # storage.data, - # 
self.alg.gamma, - # self.alg.lam, - # self.alg.critic, - # self.alg.critic.evaluate(critic_obs), - # ) logger.toc("collection") logger.tic("learning") - # self.alg.update_critic2(storage.data, last_obs=critic_obs) - self.alg.update_critic2(storage.data) - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_category() @@ -143,20 +112,6 @@ def learn(self): storage.clear() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated - ) - ) - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["weights"], - modifier=self.env.dt, - mask=~terminated, - ) - ) - def set_up_logger(self): logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) logger.register_rewards( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9820d749..4cef9ec4 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,12 +1,15 @@ import os import torch +from tensordict import TensorDict from learning.env import VecEnv from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class OnPolicyRunner(BaseRunner): @@ -28,9 +31,22 @@ def learn(self): actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - + rewards_dict self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize(transition, device=self.device) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -45,6 +61,14 @@ def learn(self): self.policy_cfg["disable_actions"], ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + self.env.step() actor_obs = self.get_noisy_obs( @@ -59,16 +83,23 @@ def learn(self): self.update_rewards(rewards_dict, terminated) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_category() @@ -79,6 +110,7 @@ def learn(self): if self.it % self.save_interval == 0: self.save() + storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): @@ -139,15 +171,3 @@ def get_inference_actions(self): def export(self, path): self.alg.actor_critic.export_policy(path) - - def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - self.alg.init_storage( - 
self.env.num_envs, - self.num_steps_per_env, - actor_obs_shape=[num_actor_obs], - critic_obs_shape=[num_critic_obs], - action_shape=[num_actions], - ) diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 8354b179..2191093c 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -43,3 +43,6 @@ def add_transitions(self, transition: TensorDict): def clear(self): self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() From 6dc86fd23869cbce2141b774f4070639759339f7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:18:30 -0500 Subject: [PATCH 11/98] fix normalization, make it work for (n, m, var_dim) shaped data --- learning/algorithms/ppo2.py | 4 +--- learning/modules/critic.py | 3 +-- learning/modules/utils/normalize.py | 7 +++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index ae5226d2..69d6dcd2 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -87,9 +87,7 @@ def update_critic(self, data, last_obs=None): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: - with torch.inference_mode(): - value_batch = self.critic.evaluate(batch["critic_obs"]) - + value_batch = self.critic.evaluate(batch["critic_obs"]) if self.use_clipped_value_loss: target_value_batch = batch["values"] value_clipped = target_value_batch + ( diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 9b11f3ee..47e0decf 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -31,6 +31,5 @@ def evaluate(self, critic_observations): return self.NN(critic_observations).squeeze() def normalize(self, observation): - # with torch.no_grad(): - with torch.inference_mode(): + with torch.no_grad(): return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index a8e52fd2..bbea22da 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -7,10 +7,9 @@ class RunningMeanStd(nn.Module): Keeps a running mean to normalize tensor of choice. 
""" - def __init__(self, num_items, axis=0, epsilon=1e-05): + def __init__(self, num_items, epsilon=1e-05): super(RunningMeanStd, self).__init__() self.num_items = num_items - self.axis = axis self.epsilon = epsilon self.register_buffer( @@ -45,8 +44,8 @@ def _update_mean_var_from_moments( def forward(self, input): if self.training: - mean = input.mean(self.axis) - var = input.var(self.axis) + mean = input.mean(tuple(range(input.dim() - 1))) + var = input.var(tuple(range(input.dim() - 1))) ( self.running_mean, self.running_var, From 9fc831e6546ddc5c9f9c2b409ca0066ce4446d68 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:41:23 -0500 Subject: [PATCH 12/98] some cleanup and fixes on normalization --- .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/modules/actor.py | 14 +++------- learning/modules/critic.py | 7 ++--- learning/modules/utils/normalize.py | 28 ++++++++----------- 4 files changed, 19 insertions(+), 32 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index f7699f32..7911206a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = False + normalize_obs = True actor_obs = [ "base_ang_vel", "projected_gravity", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..7f40d502 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -51,9 +51,7 @@ def entropy(self): return self.distribution.entropy().sum(dim=-1) def update_distribution(self, observations): - if self._normalize_obs: - observations = self.normalize(observations) - mean = self.NN(observations) + mean = self.act_inference(observations) self.distribution = Normal(mean, mean * 0.0 + self.std) def act(self, observations): @@ -65,16 +63,12 @@ def get_actions_log_prob(self, actions): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) - actions_mean = self.NN(observations) - return actions_mean + with torch.no_grad(): + observations = self.obs_rms(observations) + return self.NN(observations) def forward(self, observations): return self.act_inference(observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) - def export(self, path): export_network(self, "policy", path, self.num_obs) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..7ebe76c8 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -27,9 +27,6 @@ def __init__( def evaluate(self, critic_observations): if self._normalize_obs: - critic_observations = self.normalize(critic_observations) + with torch.no_grad(): + critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index bbea22da..d00b69b5 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -3,10 +3,6 @@ class RunningMeanStd(nn.Module): - """ - Keeps a running mean to normalize tensor of choice. 
- """ - def __init__(self, num_items, epsilon=1e-05): super(RunningMeanStd, self).__init__() self.num_items = num_items @@ -46,18 +42,18 @@ def forward(self, input): if self.training: mean = input.mean(tuple(range(input.dim() - 1))) var = input.var(tuple(range(input.dim() - 1))) - ( - self.running_mean, - self.running_var, - self.count, - ) = self._update_mean_var_from_moments( - self.running_mean, - self.running_var, - self.count, - mean, - var, - input.size()[0], - ) + ( + self.running_mean, + self.running_var, + self.count, + ) = self._update_mean_var_from_moments( + self.running_mean, + self.running_var, + self.count, + mean, + var, + input.size()[0], + ) current_mean = self.running_mean current_var = self.running_var From 6b9a2c9324fa9daf56631149e326e5025ff073bc Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 12:04:58 -0400 Subject: [PATCH 13/98] add coupling to humanoid torques --- gym/envs/mit_humanoid/mit_humanoid.py | 30 +++++++++----------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 74e35d5a..10169afc 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -12,23 +12,19 @@ def _init_buffers(self): super()._init_buffers() def _compute_torques(self): - self.desired_pos_target = self.dof_pos_target + self.default_dof_pos - q = self.dof_pos.clone() - qd = self.dof_vel.clone() - q_des = self.desired_pos_target.clone() - qd_des = self.dof_vel_target.clone() - tau_ff = self.tau_ff.clone() - kp = self.p_gains.clone() - kd = self.d_gains.clone() - - if self.cfg.asset.apply_humanoid_jacobian: - torques = _apply_coupling(q, qd, q_des, qd_des, kp, kd, tau_ff) - else: - torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff - - torques = torch.clip(torques, -self.torque_limits, self.torque_limits) - - return torques.view(self.torques.shape) + return torch.clip( + _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ), + -self.torque_limits, + self.torque_limits, + ) # view shape def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. 
squared exp diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..cdc4334f 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -248,7 +248,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" From e52208f5527c99849938726841ebba87a4dd01f6 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 12:14:47 -0400 Subject: [PATCH 14/98] add exp avg filtering --- gym/envs/mit_humanoid/mit_humanoid.py | 25 ++++++++++---------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 ++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 10169afc..67b21868 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -2,6 +2,7 @@ from gym.envs.base.legged_robot import LeggedRobot from .jacobian import _apply_coupling +from gym.utils import exp_avg_filter class MIT_Humanoid(LeggedRobot): @@ -12,19 +13,17 @@ def _init_buffers(self): super()._init_buffers() def _compute_torques(self): - return torch.clip( - _apply_coupling( - self.dof_pos, - self.dof_vel, - self.dof_pos_target + self.default_dof_pos, - self.dof_vel_target, - self.p_gains, - self.d_gains, - self.tau_ff, - ), - -self.torque_limits, - self.torque_limits, - ) # view shape + torques = _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + torques = torques.clip(-self.torque_limits, self.torque_limits) + return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. 
squared exp diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index cdc4334f..d442216d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -111,6 +111,8 @@ class control(LeggedRobotCfg.control): ctrl_frequency = 100 desired_sim_frequency = 800 + filter_gain = 0.1586 # 1: no filtering, 0: wall + class commands: resampling_time = 10.0 # time before command are changed[s] From 54b826db9475061b01139dc58b6d0e0d930dedb4 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 19:00:50 -0400 Subject: [PATCH 15/98] add oscillators and phase-based rewards, some tuning --- gym/envs/base/legged_robot.py | 45 +- gym/envs/base/legged_robot_config.py | 2 +- .../mit_humanoid/humanoid_running_config.py | 2 +- gym/envs/mit_humanoid/mit_humanoid.py | 91 ++ gym/envs/mit_humanoid/mit_humanoid_config.py | 157 ++- .../robots/mit_humanoid/friction_model_L.pt | Bin 0 -> 13519 bytes .../robots/mit_humanoid/friction_model_R.pt | Bin 0 -> 13519 bytes .../mit_humanoid/urdf/humanoid_F_sf.urdf | 16 +- .../urdf/humanoid_F_sf_learnt.urdf | 996 ++++++++++++++++++ 9 files changed, 1242 insertions(+), 67 deletions(-) create mode 100644 resources/robots/mit_humanoid/friction_model_L.pt create mode 100644 resources/robots/mit_humanoid/friction_model_R.pt create mode 100644 resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index e02507d5..1bc4623e 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -532,6 +532,29 @@ def _init_buffers(self): self.num_envs, 1, dtype=torch.float, device=self.device ) + # # * get the body_name to body_index dict + # body_dict = self.gym.get_actor_rigid_body_dict( + # self.envs[0], self.actor_handles[0] + # ) + # # * extract a list of body_names where the index is the id number + # body_names = [ + # body_tuple[0] + # for body_tuple in sorted( + # body_dict.items(), key=lambda body_tuple: body_tuple[1] + # ) + # ] + # # * construct a list of id numbers corresponding to end_effectors + # self.end_effector_ids = [] + # for end_effector_name in self.cfg.asset.foot_collisionbox_names: + # self.end_effector_ids.extend( + # [ + # body_names.index(body_name) + # for body_name in body_names + # if end_effector_name in body_name + # ] + # ) + # # ---------------------------------------- + if self.cfg.terrain.measure_heights: self.height_points = self._init_height_points() self.measured_heights = 0 @@ -994,11 +1017,11 @@ def _reward_lin_vel_z(self): def _reward_ang_vel_xy(self): """Penalize xy axes base angular velocity""" - return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1) + return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1) def _reward_orientation(self): """Penalize non flat base orientation""" - return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1) + return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1) def _reward_base_height(self): """Penalize base height away from target""" @@ -1009,11 +1032,11 @@ def _reward_base_height(self): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in 
actions""" @@ -1025,7 +1048,7 @@ def _reward_action_rate(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" @@ -1039,7 +1062,7 @@ def _reward_action_rate2(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" @@ -1064,26 +1087,26 @@ def _reward_dof_pos_limits(self): # * lower limit out_of_limits = -(self.dof_pos - self.dof_pos_limits[:, 0]).clip(max=0.0) out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_torque_limits(self): """penalize torques too close to the limit""" limit = self.cfg.reward_settings.soft_torque_limit error = self.torques.abs() - self.torque_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2]) error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma) - return torch.sum(error, dim=1) + return torch.mean(error, dim=1) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" @@ -1092,7 +1115,7 @@ def _reward_tracking_ang_vel(self): def _reward_feet_contact_forces(self): """penalize high contact forces""" - return -torch.sum( + return -torch.mean( ( torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) - self.cfg.reward_settings.max_contact_force diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 2955f5a5..221a6afc 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -163,7 +163,7 @@ class asset: file = "" # * name of the feet bodies, # * used to index body state and contact force tensors - foot_name = "None" + foot_name = "foot" penalize_contacts_on = [] terminate_after_contacts_on = [] end_effector_names = [] diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..fd21c9b4 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -142,7 +142,7 @@ class asset(LeggedRobotCfg.asset): # +'humanoid_fixed_arms_full.urdf') file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) keypoints = ["base"] end_effectors = ["left_foot", "right_foot"] diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 67b21868..17167286 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -11,7 +11,29 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) + self.oscillator_obs = torch.zeros(self.num_envs, 
4, device=self.device) + self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + # reset oscillators, with a pi phase shift between left and right + self.oscillators[env_ids, 0] = ( + torch.rand(len(env_ids), device=self.device) * 2 * torch.pi + ) + self.oscillators[env_ids, 1] = self.oscillators[env_ids, 0] + torch.pi + self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) + # reset oscillator velocities to base freq + self.oscillator_freq[env_ids] = self.cfg.oscillator.base_frequency + # recompute oscillator observations + self.oscillator_obs = torch.cat( + (self.oscillators.cos(), self.oscillators.sin()), dim=1 + ) + return + + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): torques = _apply_coupling( self.dof_pos, @@ -25,6 +47,44 @@ def _compute_torques(self): torques = torques.clip(-self.torque_limits, self.torque_limits) return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) + # oscillator integration + + def _post_decimation_step(self): + super()._post_decimation_step() + self._step_oscillators() + + def _step_oscillators(self, dt=None): + if dt is None: + dt = self.dt + self.oscillators += (self.oscillator_freq * 2 * torch.pi) * dt + self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) + self.oscillator_obs = torch.cat( + (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 + ) + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. 
squared exp return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) @@ -52,6 +112,7 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) + # * self._switch("stand") def _reward_dof_near_home(self): return torch.mean( @@ -60,3 +121,33 @@ def _reward_dof_near_home(self): ), dim=1, ) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel # * self._switch("stand") + + def _reward_stance(self): + phase = torch.maximum( + torch.zeros_like(self.oscillators), -self.oscillators.sin() + ) # positive during swing, negative during stance + return (phase * self._compute_grf()).mean(dim=1) + + def _reward_swing(self): + phase = torch.maximum( + torch.zeros_like(self.oscillators), self.oscillators.sin() + ) # positive during swing, negative during stance + return -(phase * self._compute_grf()).mean(dim=1) + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index d442216d..826480e7 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -3,13 +3,15 @@ LeggedRobotRunnerCfg, ) +BASE_HEIGHT_REF = 0.80 + class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 num_observations = 49 + 3 * 18 # 121 num_actuators = 18 - episode_length_s = 100 # episode length in seconds + episode_length_s = 5 # episode length in seconds num_privileged_obs = num_observations class terrain(LeggedRobotCfg.terrain): @@ -75,13 +77,13 @@ class init_state(LeggedRobotCfg.init_state): ] # yaw root_vel_range = [ - [-0.1, 0.1], # x - [-0.1, 0.1], # y - [-0.1, 0.1], # z - [-0.1, 0.1], # roll - [-0.1, 0.1], # pitch - [-0.1, 0.1], - ] # yaw + [-0.5, 1.5], # x + [-0.55, 0.55], # y + [-0.35, 0.1], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] class control(LeggedRobotCfg.control): # * PD Drive parameters: @@ -97,22 +99,25 @@ class control(LeggedRobotCfg.control): "elbow": 40.0, } # [N*m/rad] damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 5.0, + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 800 + desired_sim_frequency = 1000 filter_gain = 0.1586 # 1: no filtering, 0: wall + class oscillator: + base_frequency = 1.5 # [Hz] + class commands: resampling_time = 10.0 # time before command are changed[s] @@ -138,6 +143,7 @@ class asset(LeggedRobotCfg.asset): "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "mit_humanoid/urdf/humanoid_R_sf.urdf" ) + # foot_collisionbox_names = ["foot"] foot_name = "foot" 
penalize_contacts_on = ["base", "arm"] terminate_after_contacts_on = ["base"] @@ -148,35 +154,88 @@ class asset(LeggedRobotCfg.asset): # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 + fix_base_link = False disable_gravity = False disable_motors = False apply_humanoid_jacobian = False + total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): soft_dof_pos_limit = 0.9 soft_dof_vel_limit = 0.9 soft_torque_limit = 0.9 max_contact_force = 1500.0 - - base_height_target = 0.65 + base_height_target = BASE_HEIGHT_REF tracking_sigma = 0.5 + # a smooth switch based on |cmd| (commanded velocity). + switch_scale = 0.5 + switch_threshold = 0.2 + class scaling(LeggedRobotCfg.scaling): # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - virtual_leg_length = 0.65 - dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - base_height = virtual_leg_length - base_lin_vel = virtual_leg_length / dimensionless_time - base_ang_vel = 3.14 / dimensionless_time - dof_vel = 20 # ought to be roughly max expected speed. - height_measurements = virtual_leg_length - - # todo check order of joints, create per-joint scaling - dof_pos = 3.14 + # virtual_leg_length = 0.65 + # dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 + # base_height = virtual_leg_length + # base_lin_vel = virtual_leg_length / dimensionless_time + # base_ang_vel = 3.14 / dimensionless_time + # dof_vel = 20 # ought to be roughly max expected speed. + # height_measurements = virtual_leg_length + + # # todo check order of joints, create per-joint scaling + # dof_pos = 3.14 + # dof_pos_obs = dof_pos + # # * Action scales + # dof_pos_target = dof_pos + # tau_ff = 0.1 + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.4, + 0.4, + 0.4, + 0.1, + 0.2, + 0.4, + 0.4, + 0.4, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + ] dof_pos_obs = dof_pos - # * Action scales + # # * Action scales dof_pos_target = dof_pos - tau_ff = 0.1 + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos_obs class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): @@ -184,6 +243,7 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): runner_class_name = "OnPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): + disable_actions = False init_noise_std = 1.0 actor_hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] @@ -199,26 +259,27 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_obs", "dof_vel", "dof_pos_history", + "oscillator_obs", ] critic_obs = actor_obs actions = ["dof_pos_target"] class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + base_height = 0.0 # 0.05 + dof_pos_obs = 0.0 # 0.0 + dof_vel = 0.0 # 0.0 + base_lin_vel = 0.0 # 0.1 + base_ang_vel = 0.0 # 0.2 + projected_gravity = 0.0 # 0.05 + height_measurements = 0.0 # 0.1 class reward: class weights: - tracking_ang_vel = 0.5 - tracking_lin_vel = 0.5 - orientation = 1.5 - torques = 5.0e-6 + tracking_ang_vel = 1.5 + tracking_lin_vel = 3.0 + orientation = 1.0 + torques = 5.0e-4 min_base_height = 1.5 action_rate = 0.01 action_rate2 = 0.001 @@ -226,8 +287,10 @@ class weights: ang_vel_xy = 0.0 dof_vel = 0.0 stand_still = 0.0 - dof_pos_limits = 0.0 - dof_near_home = 0.5 + dof_pos_limits = 0.25 + dof_near_home 
= 0.1 + stance = 1.0 + swing = 1.0 class termination_weight: termination = 15 @@ -245,13 +308,15 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 + discount_horizon = 0.5 # [s] + GAE_bootstrap_horizon = 0.2 # [s] desired_kl = 0.01 max_grad_norm = 1.0 class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 24 + num_steps_per_env = 32 max_iterations = 1000 run_name = "Standing" experiment_name = "Humanoid" diff --git a/resources/robots/mit_humanoid/friction_model_L.pt b/resources/robots/mit_humanoid/friction_model_L.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1d680d5aa88e8f496d957fa7db1640c98ee00e1 GIT binary patch literal 13519 zcmbVz2{={X+qQX56j4GlCQ;_$>}MU4%A86nl1Mah(4KQDOb8#{Mg!LYGF$_(v>Uup)G6kbl&|<&j|# z{wqyoc%lXdK9c>Q$o|NHMGJXizaTPBOTz-9rvKU~!xQ%r?vEMIlkkz~uLD;E1=x<~ zNxIz=j^as8a}jbG323K7x5_G9L00-;<>Jf;ko^z z&VSj$sDOd2=ehrrIUWW+{YfUo^Nit5`{!^g7e++(XFSi#ML5z&q<`1a3B2hsycr?? zY#2!S$gtq3$k0X0C-7#*@VtL*|0mFtH!Ftc^Y4MCyxB3lIll#(@#eRv$VglbVCp@HK0Kel~&A=N+HzR7qe z2c^GS{;%6WfcrZ#rcOYHt$c}MzOLNGTak5LPy{x*xwvfGXD}B{gw(4OFzQnlO`Ck1 z+CG*im-%w^{M0HkUvDrOvGhCHbx_!TW9l@4ynDPLuyi+hwCp0xpJ-#>IPU`u{4rgy zxceljeQHA6lw$3zq`#pzPel+I@RRUcVo9Hh9DTI%6#bZU7=`*DFizX3V z2t`-VcU1k!XrgtI%dKcQMpUoO5Lh033v$QwxCf^^K`};~oDn8mUieT5Y;@#yTJdOr z|73a~?FA{=Jy|fSG#P!5J*A&t$_b>NuO@-bRM1>xDo8aMOfPt7lefNHZeP$@!Gj03 zU=sh7c5PLn))k^eNFrMxHf$?-U$9JIvT?NF5`QE8V&o^-7R-{;O*_D1_hI@aA)cn( zlj0s#tYbnYy~&Ki7I^vmI-@&kE!{k74?Yt)Neo}TBz?P5$-P;p=≷s5+#I3~H?* zy@lnp#73LjDK179Or5}$oT5jrUEg8C0-xY6R$o*|lASErB zF=;RDYC1++Hq2&HChin0PTed>Dd3Ymkp+xT?uC_&)!z0_20 z5%GDMFGzYaX0tfA zTcA^uK?=qTATkZ<+t-ey!z)!l^$N%t(K&S9qFxdk8ckpOjuFI$XOb)3=A`J&A>tGt zM~i>x3oeeVfQk7sWcR{4GO>F*{Z^{)60X4`!<*LX_&GVKIm@usA;>ku8G=1jNSb+XM)_FRLv zu`QA--H7(VL1a?=2zV)|fo<9PEld2z32uDv#Y4K+$>h7@g6t)&f@=320-rp8I{0-h zd8BxQ^i4Kysq!wzO`}_=qwh-+RiMUBy1PkWYA8cDd+(;X%ro-*oUmZ6&>I?T^&0#i zdC`yaW@6`$yVOo~4k*LPCxRlvP2{h4HY>I)({^GIA;EqM{6K_9<5iSMqb2!zMSG9&sXlbo9K zg4eEZXqWN_0poWQ)i`SeuzZ&wE9VC>2x}!(3X0tDAP~6Lj1#0z5(`-6@m>#WN?@H24 zbOm8;>fGu}*ZD5!Dj1VHl3T1ko4%{{g>IR(C{Iq2o`70f(cmZWATgxjLLu21s>=kA zeMB2eWdtu(`pBl!L%0jG5^2e}*@7*#uknMG23K&>kh^gH0_M{o1%aYAQm4&dh)|{v z5sTBM@0LrEK98BiBwK|XmkXrwBe+z(kW1XEx{1&Cy|nYqBD$KY3H0Mr1#!>p1fN!0 z3Z@4b(e*bissFcUgk$mq%g)6M9y?z^(bH+9Y`m_$x$j8siL#AkZN(C5`DrKjXt^tQ zett3c+splsY~IMa`k&w)zkeAV#be3iy^eGZSw-kADf{XJvjif8O$6JQXjADW!|5rp zSM>Z%XEJ|$1$WE$(>VP>2#Ye&(Du}xyVkIW+kNLU#?Qzi{HzIF;fOYF-w6qOBfHD| ztsWKJV>zDy{l}BBMT1);GUUj|F&~KkI3vOzmq%t@RA~9C*Fmnh=unY6GUQ2j9T^cM z+tRx-UvTh9I``EIZ!+Eb9jR&@O#O2C^!d*svhL^?n6YvSzPodd8rh#FZ5&BKp_C#y z7@A6N6%P`epKV2oR?5=@CzObkXC1BidIaBm?j^eU1_C>1BpIU@QQ689WU(rTo92=z zaJnSSo#OwRjd?emyLrDoH~Dotjg2U#9w8cB-69#H3(?#Y2WC+2T3=vR0f4aOY}WVoI+y?51|l}qh|$9tR6 z{;3?X%&@`D&tEh9rG%i@LV=u7t%T^w`xwWokNBT|>VnN>CYzzS6&+;rjs(D7W0lpm_{m{pSzdeAt+KW^^S4oxB1DNkd^ptQcK*tBu`U zdW@uXsxf7!^}wQa4jCc;5*saYn0aTubL`5O)5WhXX~E5G*g0h{mN*!Lq+43l=$%t&rk615slSgsqWzdX zv7?JVbx(m^ym*wjU%>pV+s9lPX$$JptD&UqF(Y-7CAf10xAn+Jl)Sr~o{||04LiCZ zqDD{S^=Um{yu%9qO*9csB&Cn1ylFlfuz>0OcutjGIIklbzjnHS{?$&|HO|saQ zT8v(@M(l{usr*xpDn$Iu2x>909xa3R;lvw0{Q1)LaQwtxkQ_f7_iagp&x`J1>Ru`E zkafVXVqRd>kPg8Ow$K;Wh~oC8*kJVp*E^=MPmQkg-4>5wQa)yZg49D8d14oN=b27j zYNO~Jz71FT(+s9&aS;`5?E&>c_VD>-F1Ms}grH2Tjj8qvBcqO1lctpYnB`JI9fO;R zbmj=bg*i*;lLHBYmrvcvq5VxTeWDVr5Bo;L<}gH6mHQAZxArFRi^oVlEC+mTL7FifrdOPPfv*3#)_vJYQRwGUezIHDm2e#V6@?-np<%ueY z^=NEPn0f#!y~3!Sv>N)ZPk`oAKd8lpR+N8QPmPBP$dMzr=(NXoXi?@hmiPWFHlKB7 
zMj9)lN1O=TRDX?0d~%s_<__itp6+G}qwnB`-gA)Y5dg1rjX|q<8#v!vgd);1Pg8T)h4EI?JGEJkp!x~ z`H)g_1BB+JQ?E)B(gnHXrC=5&URA=W)03&k)i`8Dq;b*eX0~1O9Ge~#OWf2sVC0bp zF6T2j#pZ9wuH6?f@xeJ}Y0@WH1Kp5codAZPB2l=<2#l63#NDfw)7g<6xH-d*zdVYi zhwEf<8@GgtTGkV(rxJq8Su<#+!ZcjeXayc$-gCBX_eU!zVOE;P;%YMi)*e1d#N0=a z)S-5unxDY7xzsSBnW`|Ux|^{szRju)3#H8D<;c}VaB5hGA|8^|c~2Ui^{58-&{(;P-S%WBtzLN<+9Ge@hU_gE{%r$$q@e&`jS{0p3Uld-xNh{x(qg3@ zs!4PHAi=&IWBQ~n9Nvf(Frv;@C>G~JPQ0v!yKnXrxlQjOYT7w`Gw%U}gf@`y_tG?c z@_swT@CLm0K$65PC19wrlLh4uEDc$Jm3KcdQTIfMe32;Wd}>eBr&ZwzUo*n7Fkq`c zs)BgkQ8uUOGw%K1OyxNK%sAb6wqkq;?b;UukM9>l*!9~mYkwi8y-k)Jd?|)cKI<@dig@(v+m$f#=~jNzxChMc+1~V1N)%p9 zTZFU9rjWGJCrItoQ{XZ$moz4vzz|DK?#;4tn!I%H10Yq z&Zg(p)CFUe!&upME#!yE2|S)1&Mh;TXLlrg0^TaBVyk7d(9z?A-Q%~z37c}3X!bmZ zot2hEbCfB$pI-zWFVdMgZ+9`_!+deDY8<&IFCkFBco^a|SXT9>2430SjQNYjNau^Y z(3%(oH(iY&c=ZC3Xs1nD_PD?i83B>5>EWD}oPyMd7L=fnN-1g< z5rSp&m|Rl>I#;@ic$7?si2Oo^ZIwd%@3V+zU^;$qCtmca% z>S6i;%=VN}@vqn6yaL7g`<~F}cLdwJ7{Zx#3+7GMB{`PEXu!;BqWD&c+P@iw^985i z&fZlpeeYd-*SrcJE)1ha4l-OhK_C%rx`fXbo+FPgs?Z@7{usFWG$S0pjk&+l9<$Gc zLeJ4b0=%0K#jP#y-uMu`zN;0fb~HQ0$Ocj?elSTd5@2`3S}?29AfmBH@l|vRp60eO zk zL&D2X)8wd|pJk2 zbqKkyq)Ckv%Hdp@FT4E95LU0X6g0+s=lfh$2fvI**lR8)$a3gILHl&P>aIu^xZa>A z@9IL}_5gU2TS7jsy^bSXZ<35?7uc%d4JW1A*z;EmVe%*ms2w3fcc${$H%;b(YU#t& zI9{IIT5L>{{bA(Z{tJ+KR?XC%x3k8M#3WT?9XXV-;vh6EqN$%_{8WyZ&0>+p8)`c%MPI5d}}Ygc1R z%zidqY!^RXYCD^MP81~!M&OLQ{-{0VHfvq}81pp0Gq)c-f{lvGbmQ~oc+p#dY~AyM zRaV&q)3?pV=ej9mhS+doXnzrh$~*(5*ey)Qa#b=#u$`t9$%CDS1nzIsB-R$gX<%t5 z6A>WJ!cc2)eQwTwwM~N4+$Tg*Oaf73SQ2ui9MNL=H%!VdMfTEely~&8an%<>OCfVw z^n@~sN^{V}LzI}tu3$9}=CRjxlmy<-`y2gJH!31#M7`W5kd?QF(%8z&n55E(4wWu& z`%okZZBrsXCP&d^7UD+R*<^IzQFwN#lui7a4n-4lm@!xL*?aSxN$Jyd za79lhrMB<&A+gap27(2lOe6Vkz|b3TKKw_3;wq8INe*F zL^h0Krn$M1PTy5Hr#=$p)@9@PD?gZHO=5KXm0K__@FWO*ZG*HH4rW`;q#Y-;L29fd z4Umvv6!wV1IW0H(P(*{W)mmioS#4Uvl|@#M!*LWS(i4}oh~oCsu=h|4a6(!!&gm-Z zM3taWO9ieHpGd+MmqM_p8|euzz^7=4h*F|w&qN-{$Q%*H4UGeb~BDDL&(vj2Q*Z5F}$`F z#;Jj&OljT_wskcZI|^peiQb#}QJXH1H;!6@i0l^_G%1%(=>AR{A0LHWR+|pZ$soyE zDm4Gc5~8>|2Og*svT~s|VwxAx+t*IJwp_Kd_Di5WzCYoRYdpQ%<;`rhj{#TaGJLWR zA|DMYiC*SH2K@)7|fw5buTf ziC4f$Y}(4iz7|5AhllJYd9Pz%6_eu6h@#IfGjh40W5#vWOm$BYZA=VyL607BjkxOHm_!{iBIK)=n8 zBYOtCkAdg6C_K8IdAGZ%br`cc2w7RGMXCjOf%A5bb) zie`K@!~-GIF=?M0xxK*+8gkc=vaK~tzK#lQh_1k>T0e}~F3rj;4u(mNv#4g+DQ5H5 zr9{8X5feLY$Z^X9?5II=Ah7-mM@_K_*-Z?ly_-ocJM{OdSuH{rbb-#(n`qf7%slXX z%g$^WOv>GInC+=aR;R4vnC%Alz{oQL14<&8yZLHps#3^kWh_D0eS_FFsnWz`nh~a% z6kvh52o&GyWHJ}kL&xqxthw21e0+TpuD?5m`5Zlgn)OL>Qr~J3vx-J$QlK2(S-J>S z1TkzvYBkzCi(yrA&8VxN47Jp;AkVfX<1^a^h&nl$`BIwz&H_d9sv!X{k#tr@Vi>ib zy$d25rja@Bh1hfQbaU3Xy%6!L0&hLNjO`kOsQ&SFSakIxW(cLzg1|Ps-M5CKK!)z! 
zDNdr~laUv)8)}RO+3BfB)A-^ctXP~r-Af%g9Qia@T_HqgZkj;9_65M)o>B-eI6y9N zZ-YroIy_f!|mQre`;w5!zAC>MVC8H8OM1X_^wg-BN&!mq#=8S7V^Zb{qa2EJflT zJz>Ad?%@1PD`bp^`@=~|esg%Q7}*ed7w7#5W}OyAf?kUxewHa?8g@2kEuXgvSN6A6x~m z+cVkh%4E122yWF85{(de!&Ni}b-X4mwd0@oi>R6Ke$TVr<}zmF3lAM4WS$Y5cN z)|x~&gexJir(sQD5$tUsV6jaXJWWr*qcP5mnNJp+aE}J(!CA0cClG3SdgzrU1*p%_ zCELV?(TvKqF!GB%J$!I1!VMdcu=;mI{n(6^yl^Ci&e;%LlF+=_dO*0IuY<|Jj%R&e()WInSq=&H&0n5yUd z*cBo^=vv9*yI}{xt9ms&&3Fqh^gFS*?+Aa$)HQgYjjEeH08?GlWK# zoFt*|W>M!{2ar!^(F6N-tYQk3o70wvE2&@lV}+P*(wcK~0r(?xG$zHYx~sY-CrVhXoX%bEHq z^Wflucg%ybZBV&GjSSzS$!Z3y@As-?h{-h%_N+=QtEbt+=Z>8Swa0TXEtvlpu?hdYEQekwLpO zO5%{Da2*vt-2}-lU;f*XgXrA@IgI|^a`xstQ+$}D033r(u%F+~wkSSgoY%@T{53!M z89QxJ_qsA3aC(f>ulBOi%bszjhHqq*j|~SLx|A`~lEoJH zrmuen0v&sj+fYP(2RmR@UIScJDJ6+(r0((wv4YBK1(29nvXB4Ar#x zx~Fx>qYAj_!j9eo}-!J@ND~d{ldaL-OLG zPQjcV&DUcUK0k$3zJXXBXG`9Ch!Bqr=Fs((P?303CQkYt>si4ecXKt^sjbl{YCM4) zdwKzaB!e)$u^9#L#?$gy44f~KMM1MWmXC|4B5JaP*ESWGD;{Gn9>0J!r3*+)nj-04 zvxUeqBUvY%Tym@CIyl&;L+7PB9PgBdOX|l^^@o`_c-BK^VP7jFBc}|Jo|a%F)xw4x z*<=?yDw~d(vyaqv59`lj0Uf@=2V_+&(KO~DJ1ze(N;(#^pm^!YCsX&cQmM!G)*rgLVcz zFnA{0UH+DpG>HL|i$I@-YSTkUb!pWh1(J396uaw{B&rRc!N%k-!@Hu=WJ8W0tEO-o zuCS_P*vkh@goZv&k7{5)-?@T;gVJHpIc?TLs{i}-_ab(2xE$7_9R>Gjrllmx5DFNd9>szMIfr`5L8$PWQcs$9Eg84)0Dm5|Lwj810qk;p5$5MChdo z6`F5Om8K|D*(yzba$P<6#=B6HY0Yps+l)AvKjMp5db1|_<(wb=?*d2prW z9HtnG5^7#TfAo7G!r9~D%eq`NPA}smezKytMGbU3<;iG&4o&9lg=Fg`WS)TsNiMMf ztCL1VWvVRoU490>&#=ef{P*m}#}52fYik&kGm1LoE@I4P93^NzFp{R_?$Z+#F|lJvEVEPj>Wjn}?Z;QB{9+1p1>VAMP_zWb{WWW(VWJTK!win1l(%eNf>bFAQ} zgfsng_Z;dfuco%#G}OEnO+xhQKxV;c8o7-_9`wW`igY&fx}IQ~?|G={>%!K9d8qMY zHze-YWe?riMsuH;;m=TiwsI-ZRZC*Y$*^i>kogBjss9`H>xm8Q;pj8)dYumL89$7+ zHKp;l3?4>XjGTy7;b1yleF$qkRf8n1Du&T1BS3aVAjs^P3^6YoL8)mht*~>UW4!~= zWWq@LR{tW-Xx|8nEQBaslY*<PDZ&>Z?QI33ehTXW(A;<2siGGa~#dVSS{!h43qS?D@n zu`d;xS8NBD&zUr-iiJ02S@23EAEiyy$#G?GvN?}n!$m!Kx9}clSs#bG1bw1u8%s;1 zG^z9B=kQ4?AHS*1#~k%7gx{D;P0ovxB<4I_`|c#R>4<}(s1Q*}(xg8%17KI}dN@Dt zC)U022JtmF*`?P!VP=&R?M#>g>pgYo4NX@xRhvw!+g~P&l?0!8BGH>y0=pOz9YBrt2F1!q{&MJfHu}z$>uG>KRuq>|b5hBA} z@;PCbQXsEuFWwsA$U2ccaxHigXZig(=r3l2O^-~;NdLnb?WHO}0$>0?yS)wB) zO)snOgHLA**qe!!i`Z8D&UJMz1ns8twACsT<;Dyokmcw|jp~pIIh@CMAL!$! 
znO($`vxTHly2MWO@({YNBOEjJS&VpA&(NG+IBw)jkBv^k@+fuq8ZjSs^q%C%&s)j3 z46DU4bJR)b2}?R;)s+z|#U7kt z#;Og)Yf(WMv@-~<%p6C8_LL&AjfV_H4V3&)%rNpH?6oz?tVtt>j2^B`vkh{v+Ttyz z{hSYMei(?hN4Maf0%tTHkqvh)XrY^J0#hKwMWs_2K!51a8NMfmR7Hn!=+PVm&;fHS#uc;>Y_Lvv@K#poV<-Y$f9CaE#^SE+#C>-{J!d<+)u ztz$Tk6fxG!mr>ty3@?x=BudeT$acR)y^un5D-&b;5_CwGp%|XwCE#nDZP=|mmx*Xn zpq|Q?!Oq{9<=q#D7ZplW@=Y>MOS{E5oA;qkk2>v3-UT-KF32QSFg6)a(L^T*wzV&Y z-0-EOvQdUEvhyOh?+b8wbTFWoYX`&8%(8)uojjlBN`xp`;w*V|e&%hhrS$1oT8*BL^9a`xo7#5y@ z-@nfWo9`#l(4C9x-hRNM5+`QBKUgW-+-mRChnKp>cvGst0V_TUePi#Q5fW?Fx=QsoU zA1AGt`M>Z#EC=+@k<^6EEy9d~)*Lo-fh^wdlE#T)6WNfRO1LiiIXiJYWyL!CIK_p7 zn#Hd5aqw{(gz6pQZ#1fA&feJ1YMVKO64S@O5#H}9Dym`kDIKP(t$`yfqzpN4J6OBg zdz|KRo7lp1V@8%u=B$66%Ctn?YySF40ltlpVmg#%|di&PCM?&=}{#bT^M< zAN}+Oeqk;%%2Wb8bBowhPZanGjrM3SKY}q|o5yMmNyqraROpvr@-{8L4`o zJ?#6e+3sL}yZly*Q)a#As4DDc8}9P$+Ey=P&7}+R$m0suHrJ0;SwL}DSQr0Lr4994 zVZsP#4m`UYhHsVB__7Pq@PMNl|74UKeBMyaUOYFFny4tkz_ir!2AAaS)c_7BFxaSPIACEIy_mh|dKMnaa)!#LnM;&KtwYGDt zZBp2ux_3BZlcLy(LryhMj;&zto6F$z_bY9ENjpF5wjyWN8$*y=ZOa#*GnY}Go5KhX ztzxs(hOvheTKPSBoA?FF#mwzn9qbV1C}Vz(%f{MWX;zN<#6FNd!YM!I&8ba#{!jah zz2vC7QKa9$>o5O-e`ht2zn#pZ=A|yljLY#|c8wE;^RK^8{ijXvLuTtL_4|PX<$vZJ zEM)TEW4it|BHhVh?EiD1_8+r0{_l#J%s>mgylc_Se(^#B#nVN2p!)AQA-|6pA23<} zT>hRR@;CbTv1#&uK)?5oK>rVIL)|Hl4)AN&)0Y205}i@&kI=ir~%Oyj?>mVaY^&$U0X zo)iAUTK$dvJ#z;C@&A_ohW%G=db&tR82u|qL$v>#{Y;pKK>1H2aTbtw>N%M0j{`P|Tu;rO}dZQ44~jf)_X} z3W|~B3F&$8gmrx-`l-Xm8!ro9v@kS=ClVAB(a&l;FE(aTc(mlasNngri^3PoiwRyD z9T7Dz&PqQGdLOlbe< zLd8OV(T(TH{!TZ{Su=(w_YYe#Pu^KLnWylJNil|}{LVGP zMMx}$H}aP>%@m$$zrucLYQG58V|b%HcpAT?nf;Qc`3IdjZ}jhUV+N#Y{Y#qmfHa+7 zOk-ns<2-o(y6xuwB~9-SE(@Oi?_37|NHhFzX-2=K8UG?QiQ$=g@W%g=X8uc>*&lS4 zJoDe_EC!@m{!7|~0clphm?p;XtUY*>e%&>T|B`0&2iF81=XWl~xip3c16<nZqA!Ry@bw*(N)S$MBr~VN2#s8BjL$7t^#Dp0fvUdTavE zNV?eiKnn0qWg=`o-m8t|=a_acH)MSmXz=u~uVsam0*q{aUjsR(ohO59@ zb{|wkjumu^o<+OM59m_8iQIW*mV!@{ys7TtB(kC2n`+;3<-RtYE*P6MnkI$XQn^#^ zn10xf+dIKTFiyK1cWai>m(NaN*$E+fW2OmdzOtL>TkvRQ&S-(#x3PGwK^g6~R@3iN zR-{@Yht{MOL*SM}bSY0#FvHWFd{jD1#Sb(H4h-VbDd!|e^BSzvxN499J`3JnPp$fw&@WXos? 
zK~{kcG5q2{bmNW?Cf}WQy*t9^b-cm!pi@w*D9W`k(-!cX8i=1$6U~sY=DK&}QTMts zdd+^YZSmYFC>yKIJy=nU-jZU(?{gd(oo!5>*lBZPMouO>E~b&;_grkZF29cpoLxzZ z{$hbq&U5PX-k2+>K1`+@&t{9(Ta%WqN=mC<;@j$z?C0E_^mgi2u3K9PU8OoqurY8N z6?&{kA0`iFYNj0)9F>bAA2M$;o@dm_+{ide`&!VgU;*uWV@uo9>&bal89~qf({R|^ zkPi0MqUY~#Bu^`K1O`$D-25srz|>>V3$LXti;bY$~O?4W+k!2lAOS#i{dE6NK z12*Oy0q@W&)WlzeJHOF{JIoE~=o8P$#uq&_V@qH8#SErNXP7{B^ddp)vD;MU?FW2c zah?$~vm^<9Rj435q`}(B2zI6_aX+FOS)Fj6-uQS?FwE@=tCPb(W5Dlj0qYpxF?atJ-%N<@JaI)jSzc*ZYoxE-0%=)v-1_Q>iR95 zUsphr_*+QW_0598PrSG_g*60a!)SbOF3HqC4nbK5xf2o;8zi#RX?LatN$M#l;VmZI zyr&Zy#OGe2E3Qr>VT%T#@ez5#?8|S+>#rwO2mQIP#|Vg<{y*icl3M_*tMOfD>tNaWmRsVnb(?zZDWViZx^@Ik8Nh$ z1#AM9Shh>>N+Xs!g@w?ab$x^@o=1!*+C_z70~E5cd|aefkq$qB=Y5!g3FIoxM8Kv+>GEB>Ye7qy?e`0 z(D-^kH>BezSDJf^d664{4-MM{YXm20(t%(>*w@>5M*1~0NsbbHf8b2p1%`r8dUiTr2;O?HErjo7hho{%?x4XWcPX( z1yk$X@+|&<{@B7@Yu5I+mQ&_L>!2!gqy&;e3HV|~2#s-AX0zzgE@E_6R#2bn$!?I6CC|dt zp`H&UsdwaU>V z303J3S2T}^pL-2qD_3!{*LiRQ>-?!?Ujen6B24e7H?!(vGpX^TCc*iInIt2pn3J96 zg)4j$*&*~hGuW<$&Ro5c?!A12mUhc>OQwpEAkWo;4XwwiV}Kk6ztg5)CJV8gH!(0a zQMgn+I#Q%HoT}WHxiV294fQ4JG5PA?=kSxi|a(oAUNJCiH$~yrMkN z?`Z&4snLd+Tg=H2t9on}C^DISvr(qD6ig>B#Cg|!_#9_9npbT|jP>@za=jVU_EZ*z zH`inI+{5VpF&ES8E79n5D(OwQ3P&qn*z3IIRo)zhkEy)w; z{H9#oBAdc|xdGS{cnO!hmZugHx7h9R6Y0L*awgvCCB`eQ!%)>acBrB(sxI6Ho7Sll z*M@meGb*1oiucFOEgR9pb{Mzn+jr*Stgq;z5KL!g9|8@#*I=2wmU!OtLc?9 zcq(EJJr}0Ww6)*D<=N+CM7QhKr_>6|0`^gHs4S1&G6WA#6NT7oZSyLNI z-?%#xV_yq$*-VYBxp|ysHTh7E&3m^n!C_lik=S4o;33C-=Aug` zwuq8^)FCevOv%|@53tGU0DI)L5W$Y^I8p8(IkE2>RF;pYiW?@9gv)7UT!<9?z@8^7 z-g0SWNIuzfWj)>fP8^mDdjiZ=Uvg|@Ar!xihh4Uh;GFVW%&%}HSzq^(n!BO&{>?J< zb2|?kbO>qvZo|ztc80>DbL>{tZ#H7(7eQ-}JAF0c2mgXgF7|E;rAI)Mx{TWncQk}( zPV)&6%Bdt5^T&{s*0q?dc!|^U#g0r2FM;A~;dtQve4It4;d-i+;)}X9Q!d~ z)vF6zZcM=8%??B@el&Z!Ya8(ypF?}ZJ232f7#VG|51QixaoO&>D7AGc4O#jQPwfk# zPn0-dn7WevF z4alWmcSv#WmsS&}cjaV^*kQ6ZLX5IYJ~Hcz?xUT2EWYDcV7bULc6ebA79R5e)rn%< zx50&YMlT!uXS?EZ6s3nh#bJjikJhz(XO@Mg;hfm2`mx?u@Nu^v4YFZ5_rA!G)hkl? z2~*`!s>uRfOTw`3Nf_)Xy9RBRrLYH+IsTTj;7G$IR64y5?oX5_#!i-)xLBN2>b^&> ziw*4EqyqNs-8rP^f~vrvI1zj6&QqOfPnp2jb+}bpS1@yU3@3rLK?TeQqMj~*bF|(x^$Vj zJREr{L$;}jQMLRocpw~0D-J%vq!B{Yw`v~j+sP+d&8i?>(E=-H9K}6#Yss#(Rybj< zPqj?afquUN9NkQ|UOth!=vd>|2O~+ETq<+&#T*(e_mQ!hGa5Q=Q`s#~IJDME9R!9! zV5NTzj(%N-o6d8o&~inZKCS`szl?#4o7dCrB!#S;B2D*+JK({%t7IeBo|gK|hBsI0 zP)=?RvE128c0_!k4jc}B7BZe>zkf&5PgP_4mO>C2QbzaAzk{dKXOpXHNCyqeBs*77 z46b)4=A$i`+00I05=7~wxydm7gBDd8krMFw8ku=7MY+a?jSvcFk=b*LS{e4? 
zR<-~tf36oSxowa3H_K?;_qAm4LTkE&t3bvDU&QvCW+c3NGJPiT6nti@kh$9|sN0cF z5HC1|86ibPDYc!wlHkk)&pgN6$}S;!3PH5g?;@VnETjfUwQ!b69}H6zqR(Y_vZ=DC zfX)AoTkWGsq*N_5PO_n^H@};7P1uQCe!GLSsos*gcK<4Dd@>p~*VJ;7&gFxlM1%cGmmt%|^w z3?`-?LkWNPTi6k!3vtU%(Z#R2@nv8vZm&mU@a8lGuto&>QlcR;MIUqc42s!G1;d ziSj|xP^$o?rdj;6(Zg^_@he8_nIt`Ydla|lRTn$TEE68aJ;!xpCgV|AVeW;cV%$XK zTQu?si?76IaT)%0GW6$A?!}uyFfu%mgSmY_%Q4Q0+ke0I+icS?p=XVD;=$w2kiWT#op5vk4Y1e&J12{hR<#l~uet?p z7TMDq2j((Z?6pge!FfU*$qQ4W_W7w~d~g$} zNgW4aS6ygw+)eDOtcaksn{#A@7~RvA4^QXTaXwkSWFN0;W(-DtvstZK4;gV1L?<|s z4GGUK=!WJ=MsvJ6$EN&SEjaI{$ z4^ahpMBfd!B<8bfLTx&OGW3e zp}j));ia+;RgG7tHFs)2d58r_rwdb~NDd^wFQfW%Oh}-A71cZ0L+>?mX^oHoUKgK( z5ZOxl=)!hjr%Mv!cmvWDmdUqPxJWOJJZW=>(*{@0G{DYXPNbR`V`<$KFb>RQ1S(vb zpPWnbgoa@E3B&p){oe~At6$=f8ZUOW_#D<+dN7nIUSjl?+{2qoqM>rxB09;=jqTRx zLf#96K%v`MB9ceDi}I*LYBIax=1Uuy4Xc=Xr!&l5$3l{1cmjKizeD`^27b;qT_QIq z5?(&lW1>%F!t~J_h<5QwQoJ{xw0kV2Q#6}!VShV0_UcqTzVQV0ipZ(ov@wMoRZhi; zXhW~6z9QRAf8ttkd5C*+hPrNkPk0Ni;c(G@AEj$IJ>qG>$?;Amt9~exUAKqw&n?p; z@gpiY1OCAsbWj%>)^8s4m;b;&Fdy&_G;g*u&(@5>?vGB$Ug2=Ap4T^4qX zb!B|~#rTCqbu3NJK_Vi7A`iVebJ#_A);_zAbL9Y5-K@kS2MHL^?;X?UG2nd+l>b1# z#ejZk2ZAr3TQlaT)^gm1QhNuJW;>&39Kzn04d&&1SjeJnUkNnmGIdmP#Xzk=5 z6FJ4)IW+_B9yrTv_xS|b`*oO&cIL45>_|H5fi8=K#=u?`Kg^$Z4A#ZYC;27a;B9vo zN>f|$&gXgXu|SVXNp!*(_8^*kEMlT_hNFMF6`rGcwD(vAZW7(XaeJ_jA2YWc1!w?| zs;oKcFQ>r9dtBnykcTg?J*eMsO`OgXGQ+`lPOy?j(r_p2C<-6GhD)<|;fO)8%x2BY z(3D}@?`M97!*9ppl5o0R3M_nNi0yTe)a&#Ij_ULx zOmdx%+h1LR@}#|(ec70rbdM)b{DtYsIwgpX)n%({HsU-?Bf;^Uyb)LHIe z^RG%$$(eWQmqqfp@~{qmww0lt4bo)BMQ?m|&K#$lc0v1Ab=ZFO1sYver$efv*{XNN zp!#(a+-cZeuMwR|MA9N)hm|Gv*hZ4ja}uaBZej;^#S=?Y#vUcv5Z_ypW} zlc`va5ZUCL&W?ZU$X{mx&|&)%#QYCn%$9{9IG>E(1qwJqu7?G?dWL^vl8x!a4C3Fp z156wYXy&3kuv9N+E4=l|sA^mK(_|M!xGJ&=&pVjGQO7ZR#c<~C^9}gXO_9_IXJE10 zR@AE2g(Jf=v7z}M>~SoB0}Ed>&JR+VXU}%xc@qV8`tD+kpD`80PbD*5%T6*szXL?? zxWY+{JI*{|%0N0*iZ})zWLsP^8O|72dg)mTtn{yBEAlg7#6*4iG^2$Xq&x?$b9|Ut zVsq%2q|eL&`*R$vvF*5Y)g}}^zz0!dS*9~hk*e6t0&!k3`|zL!ZQQH_L2Gu9Vs1QC zK5JkcjGN%}yH=Z^!yS0#x+_NgG-Z{nBf&so2JU>ViZ_Jg$jToj*nL}+?$wc^D(*As zhB#&Nkt}AE9y_w8#x2Z`c|O$H!2knPRl)w}H;miOFz$T>=kA({YEG7;F7` zJ!@%x9#pe4fG-ND_INHhCrzX;{Ep%RA5Zw;m4piS=fU98WNiKU0Irk{rHRY)VT5oZ zojrGs!YS;&#q*+pBtH9YeGu6!Qj}PK@FrnLHS5ss;cG4+;6r-m*WQs zO|Sy1$0zCov|eF|uL1zSM&f@XQOW@_%39$0ydg{jE@z+XE2VHzZm%TOz1wlT2 z+_@Y5o}2yia-o|$;B zPaD$2MTnP-D|Fqz$eiif1UWZy@%q7VzRgizRzrIqNZ9LBorv8qZqjmkX!uVsQAq{s zZ$@-#;a#|Td?N&yw!+RMPnl_kxy;>>Ts)Qj1j_xT$exR?gcs~gil?)zjCmyUL_?V> z`;x zQ7uE)_9@dNgAy1+1Aqx{_v0~-XY8B{ENgOB2zC4~GQ*v2K3=YT!P(p74dEI` zShJIAa8)mb+JA9?kwx#Yd)7K|eD{@Y%{auE56K02;VH1%_aqZm?#(#I2El95Jxr#! 
zF?^i5h21quhhA6tjLL6XFr6J>`gNcPVzy+l*HE!^x^lLeJP6vvXfq zW1XlRFeSE#FKzJo`s+CT(sEQ-@)NtJdb0YX8gTw}8xsEN7_-_}p}+sN9@Ylcg0!k4 ze%&yd7Szhncq=9PF8Ug?y_Ey0DL>ItTt z_VS|nb^f_C@H9iKDf_u!R4g0XgOPW5;OJT~sT)~-eV&W@6Xkubl9f#UNdDfjwCN z6GK!#K~jYQ=gGP%{QP+fN`^}V@^(T-?RXk8%9P#Wzr3z%(n+W=3SnMG=D?#ncFeto zP^Kz00-qTSB^zFxQ84XUHCDC_V@7?=WbeGb%eTy}!69OG@M^F-d%M3cf5fpFV3lRUe7-9Mt?R$w znqDdVzF`OCYEtOAWe)+Fw$$b7c9^9ohwqOE^7BgOQziL#U|DHFJH2kRC;V39O4bL8 zR14X+7UnQ^_$PQ3!4QYo&)B-@1dg3^23@{8^V^C8U~_>i$yfLR0g+Si?8gjxL(~yn zH(Js;+UnH1auWE)>JY!W@pOOAQ_k~`D)fGdDZQYa!v?N;jUSKow=HCDaenY8(i2)= zQ1PTD4V=}0gVmF$hKCJGeT(L+oq7by6GyYT*~#?7vGZVBrv_@XIsDU`9x)yFOmOum z2jcbZD>jm)G%jj4X#2%M^BiqHqg_E(j?F|*aV;j~j4X`O4ky-1lC-q$HeM6i2OW<* z>4^&)!S?P*YOpDQIcT;WF8STTq|2scdV~|(qFw+OO_T9{;6oTEWkhukC=dltTRPpY zgmT&qQNB@@UjMomj~N)?^rh9-UtWoVplKz$t*aT+Z;vOnp_1fW-!+)DQHpprTeIh4 zp2Me50j6&aWY+5sC8j-t$X)IOX8j-~a4KqJ3Y@C?{h+hxG37IVLF)!Q;JO3#4~P@J z5JkGaND1Dw?Scut`}ryoR~TXc`uau3G?}im1+Yp5u=(>Gc%`$Zznyv-&K0ghqlkmh ze7c$lt#&5&rp2Qni5J4S6uTZvxf*{gMy8Em@hELYyiX z0!?FVAxC}>BmZL>-sPWX$88@&r?`E=vhLl`Whn`^6 zOtCu-LyJ=3QrJCkd%hJs&0WyQDI8YL>4M7Ix%5U&KC?u2ESNRb^HSaSmQMvq|L zNDV`Y7z1de>3Hc!0ChaL9xRnqK;L^dQ)c8sCM-&U4cn4oI9_9(x36Y{Q}*H!ZX$o3 zK!;=MW=f-JIy5Eh!Q7`iu&TV3U9xi_%D&w~ZuDfc`g|2K%3X$dc4{#v!x~^1lLMFf zH0VJKhPp2`=6o@*M=9S6%=gtz=yT2%PMS*cI|ljFv}9Fy*eOl!`&2QnBRA2;<$8=s zyD%!(lwsG6^>niN0_Yku4)Vt^pqmQiZ5iQnVEDvste!$$AbC{E!{E4aP8^U&vZsM!oxnIlUR5QhO ziIBN3gz?*U6bJN6U3hHtzwkdM4Cp`T`i6fZe3ngjbus5k=W+hkcXv5mfs!l^UCck7wPLJmq& zPIHobrq@qYScOqnsv$RjIrL5($G`t040ydV%-&!{&Ucp^h>x8(j$fa#S{F>1CFR;s z*>CD>5pt zlX*1=hHhg}ATg9g-*HB@TXG=8_h*|lY@mIk4>Q|C7K!IKw*B)+e(jE(*e)@h&zrZF zX-F7KtNnV=wkC+`mz47dy*G!bp1U?3jfbG^nKxd|eh5bSlISv{jPr5x3nrJ%z$*#m z?9^p+zJ?3Dc{!Ao{0WXiu_G|Fc(Td%LdMCojBk+TBM$ZII6(Z*5mOHkNy&Nh9+|Nmz;f=%CwEa~a<+G!4l!Gpw+Sbfm z_<@XVr4OV0qmiSdI}7BF4Pmc}DYKT#-|^pHkOfDrL!8Xna?GLhHqPm^x$G04D@>5r zR7Sx_6KAZFV+Qg!^l2n-tw_Ir*I)hv|ITV4e^ofkP-@KIdBEnix+zJix(Ow?#{vk ztN)%8^81MK0h9I5J{EYky+h$Nz=3`WyRu=E(l>|CaxT{a0?fI}aLU_^+T*qW#zOuSN2&^_+ha nP)JDL?AL*Q+`xPwp(Pf-wg)EuxI2pv%=;0N?EekK|BC%T!Zc6A literal 0 HcmV?d00001 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf index 51a84eee..f7f55186 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf @@ -562,7 +562,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -607,7 +607,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -660,7 +660,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -711,7 +711,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -790,7 +790,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -835,7 +835,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -888,7 +888,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -939,7 +939,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf new file mode 100644 index 00000000..5e76f34a --- /dev/null +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -0,0 +1,996 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From c9411cdf0c07b55d7095b07233abbf41f5eeb5c0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 14:04:09 -0400 Subject: [PATCH 16/98] some tuning --- gym/envs/mit_humanoid/mit_humanoid.py | 17 ++++- gym/envs/mit_humanoid/mit_humanoid_config.py | 72 ++++++++++---------- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 17167286..684f4cb3 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -111,8 +111,9 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities - return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) - # * self._switch("stand") + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") def _reward_dof_near_home(self): return torch.mean( @@ -131,7 +132,7 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return rew_vel + rew_pos - rew_base_vel # * self._switch("stand") + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") def _reward_stance(self): phase = torch.maximum( @@ -151,3 +152,13 @@ def _compute_grf(self, grf_norm=True): return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) else: return grf + + def _reward_walk_freq(self): + # Penalize deviation from base frequency + return torch.mean( + self._sqrdexp( + (self.oscillator_freq - self.cfg.oscillator.base_frequency) + / self.cfg.oscillator.base_frequency + ), + dim=1, + ) * self._switch("move") diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 826480e7..7a8680e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -70,15 +70,15 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.72], # z + [0.7, 0.8], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.5, 1.5], # x - [-0.55, 0.55], # y + [-0.5, 2.5], # x + [-0.5, 0.5], # y [-0.35, 0.1], # z [-0.35, 0.35], # roll [-0.35, 0.35], # pitch @@ -122,18 +122,18 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 1.0] # min max [m/s] + lin_vel_x = [-1.0, 4.0] # min max [m/s] lin_vel_y = 1.0 # max [m/s] yaw_vel = 1 # max [rad/s] class 
push_robots: toggle = False - interval_s = 15 - max_push_vel_xy = 0.05 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] + interval_s = 2 + max_push_vel_xy = 0.6 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: - randomize_friction = False + randomize_friction = True friction_range = [0.5, 1.25] randomize_base_mass = True added_mass_range = [-1.0, 1.0] @@ -157,7 +157,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False disable_motors = False - apply_humanoid_jacobian = False total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): @@ -195,22 +194,22 @@ class scaling(LeggedRobotCfg.scaling): dof_pos = [ 0.1, 0.2, - 0.4, - 0.4, - 0.4, + 0.8, + 0.8, + 0.8, 0.1, 0.2, - 0.4, - 0.4, - 0.4, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, ] dof_pos_obs = dof_pos # # * Action scales @@ -248,7 +247,7 @@ class policy(LeggedRobotRunnerCfg.policy): actor_hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" actor_obs = [ "base_height", @@ -266,13 +265,13 @@ class policy(LeggedRobotRunnerCfg.policy): actions = ["dof_pos_target"] class noise: - base_height = 0.0 # 0.05 - dof_pos_obs = 0.0 # 0.0 - dof_vel = 0.0 # 0.0 - base_lin_vel = 0.0 # 0.1 - base_ang_vel = 0.0 # 0.2 - projected_gravity = 0.0 # 0.05 - height_measurements = 0.0 # 0.1 + base_height = 0.05 + dof_pos_obs = 0.0 + dof_vel = 0.0 + base_lin_vel = 0.1 + base_ang_vel = 0.2 + projected_gravity = 0.05 + height_measurements = 0.1 class reward: class weights: @@ -285,12 +284,13 @@ class weights: action_rate2 = 0.001 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.0 - stand_still = 0.0 + dof_vel = 0.25 + stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.1 + dof_near_home = 0.25 stance = 1.0 swing = 1.0 + walk_freq = 0.5 class termination_weight: termination = 15 @@ -300,7 +300,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.001 + entropy_coef = 0.002 num_learning_epochs = 5 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 @@ -308,8 +308,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 - discount_horizon = 0.5 # [s] - GAE_bootstrap_horizon = 0.2 # [s] desired_kl = 0.01 max_grad_norm = 1.0 From 90ff50cc34ba6a41f442a3c732e3c53dd1c600d5 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 19:57:46 -0400 Subject: [PATCH 17/98] implement history with (approximated) arbitrary sampling frequency and history length. Implemented for dof_pos_target, dof_pos, dof_vel. 
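
The diff below realizes this with a per-signal rolling buffer that is only refreshed every `ctrl_frequency / history_frequency` control steps, which is why the sampling frequency is "approximated". A minimal PyTorch sketch of that pattern, for orientation only (class and argument names here are invented for this note and are not part of the patch):

    import torch

    class SampledHistory:
        """Rolling history of the last `length` samples of a (num_envs, dim)
        signal, refreshed every `threshold` control steps so the stored
        samples approximate a lower sampling frequency than the control loop."""

        def __init__(self, num_envs, dim, length, ctrl_freq_hz, sample_freq_hz):
            self.dim = dim
            self.length = length
            self.threshold = int(ctrl_freq_hz / sample_freq_hz)
            self.counter = torch.zeros(num_envs, dtype=torch.long)
            self.buffer = torch.zeros(num_envs, dim * length)

        def reset(self, ids, value):
            # fill the whole history of reset envs with the current value
            self.buffer[ids] = value[ids].tile(self.length)
            self.counter[ids] = 0

        def update(self, value):
            self.counter += 1
            ids = (self.counter >= self.threshold).nonzero(as_tuple=False).flatten()
            # torch.roll is not in-place, so the rolled result must be assigned back
            self.buffer[ids] = torch.roll(self.buffer[ids], self.dim, dims=1)
            self.buffer[ids, : self.dim] = value[ids]
            self.counter[ids] = 0

The newest sample always sits in the first `dim` entries of the buffer; the oldest sample falls off the end on each refresh.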
--- gym/envs/mit_humanoid/mit_humanoid.py | 49 ++++++++++++++++++++ gym/envs/mit_humanoid/mit_humanoid_config.py | 19 ++------ 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 684f4cb3..a7fb56cc 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -15,6 +15,31 @@ def _init_buffers(self): self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + self.dof_pos_target_history = torch.zeros( + (self.num_envs, self.num_dof, 12), device=self.device + ) + self._init_history_buffers() + + def _init_history_buffers(self): + self.dof_pos_target_history = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.history_length), + device=self.device, + ) + self.dof_pos_history = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.history_length, + device=self.device, + ) + self.dof_vel_history = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.history_length, + device=self.device, + ) + self.history_counter = torch.zeros(self.num_envs, dtype=int, device=self.device) + self.history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.history_frequency + ) + def _reset_system(self, env_ids): if len(env_ids) == 0: return @@ -31,8 +56,15 @@ def _reset_system(self, env_ids): self.oscillator_obs = torch.cat( (self.oscillators.cos(), self.oscillators.sin()), dim=1 ) + self._reset_history_buffers(env_ids) return + def _reset_history_buffers(self, ids): + n = self.cfg.env.history_length + self.dof_pos_target_history[ids] = self.dof_pos_target[ids].tile(n) + self.dof_pos_history[ids] = self.dof_pos_target[ids].tile(n) + self.dof_vel_history[ids] = self.dof_pos_target[ids].tile(n) + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): torques = _apply_coupling( @@ -52,6 +84,7 @@ def _compute_torques(self): def _post_decimation_step(self): super()._post_decimation_step() self._step_oscillators() + self._update_history_buffers() def _step_oscillators(self, dt=None): if dt is None: @@ -62,6 +95,22 @@ def _step_oscillators(self, dt=None): (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 ) + def _update_history_buffers(self): + self.history_counter += 1 + + ids = torch.nonzero( + self.history_counter == self.history_threshold, as_tuple=False + ).flatten() + + self.dof_pos_target_history[ids].roll(self.num_dof, dims=1) # check + self.dof_pos_target_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.dof_pos_history[ids].roll(self.num_dof, dims=1) # check + self.dof_pos_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.dof_vel_history[ids].roll(self.num_dof, dims=1) # check + self.dof_vel_history[ids, : self.num_dof] = self.dof_pos_target[ids] + + self.history_counter[ids] = 0 + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 7a8680e8..04433381 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -12,7 +12,9 @@ class env(LeggedRobotCfg.env): num_observations = 49 + 3 * 18 # 121 num_actuators = 18 episode_length_s = 5 # episode length in seconds - num_privileged_obs = num_observations + + history_length = 3 # n samples + history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -172,21 +174,6 
@@ class reward_settings(LeggedRobotCfg.reward_settings): switch_threshold = 0.2 class scaling(LeggedRobotCfg.scaling): - # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - # virtual_leg_length = 0.65 - # dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - # base_height = virtual_leg_length - # base_lin_vel = virtual_leg_length / dimensionless_time - # base_ang_vel = 3.14 / dimensionless_time - # dof_vel = 20 # ought to be roughly max expected speed. - # height_measurements = virtual_leg_length - - # # todo check order of joints, create per-joint scaling - # dof_pos = 3.14 - # dof_pos_obs = dof_pos - # # * Action scales - # dof_pos_target = dof_pos - # tau_ff = 0.1 base_ang_vel = 2.5 base_lin_vel = 1.5 commands = 1 From 5020fd023f657116d0ebc5d254bcb98f20c2d215 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 20:20:22 -0400 Subject: [PATCH 18/98] make pd gains specific for each robot in preparation for domain randomization of gains --- gym/envs/base/legged_robot.py | 33 ++++++-------------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 +- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 1bc4623e..7fa2f0dd 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -481,41 +481,26 @@ def _init_buffers(self): get_axis_params(-1.0, self.up_axis_idx), device=self.device ).repeat((self.num_envs, 1)) self.torques = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.p_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.d_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_vel_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.tau_ff = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_history = torch.zeros( - self.num_envs, - self.num_actuators * 3, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device ) self.commands = torch.zeros( self.num_envs, 3, dtype=torch.float, device=self.device @@ -582,8 +567,8 @@ def _init_buffers(self): found = False for dof_name in self.cfg.control.stiffness.keys(): if dof_name in name: - self.p_gains[i] = self.cfg.control.stiffness[dof_name] - self.d_gains[i] = self.cfg.control.damping[dof_name] + self.p_gains[:, i] = self.cfg.control.stiffness[dof_name] + self.d_gains[:, i] = self.cfg.control.damping[dof_name] found = True if not found: self.p_gains[i] = 0.0 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 04433381..1e079807 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -277,7 +277,7 @@ class weights: dof_near_home = 0.25 stance 
= 1.0 swing = 1.0 - walk_freq = 0.5 + walk_freq = 0.0 class termination_weight: termination = 15 From 82968b164fd94cbbfe95efb6d303877ed4ba6b2b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 19 Mar 2024 16:58:32 -0400 Subject: [PATCH 19/98] refactor sampled history to be more explicit and not overlap with `dof_pos_history` (used for smoothness reward). Some tuning. Change stance/swing reward to use the smoothed square wave. --- gym/envs/base/legged_robot.py | 12 +-- gym/envs/mit_humanoid/mit_humanoid.py | 95 ++++++++++++-------- gym/envs/mit_humanoid/mit_humanoid_config.py | 89 +++++++++--------- scripts/play.py | 2 +- 4 files changed, 109 insertions(+), 89 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 7fa2f0dd..d635fea7 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,10 +110,10 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - n = self.num_actuators - self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n] - self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n] - self.dof_pos_history[:, :n] = self.dof_pos_target + self.dof_pos_history.roll(self.num_actuators) + self.dof_pos_history[:, : self.num_actuators] = ( + self.dof_pos - self.default_dof_pos + ) self.dof_pos_obs = self.dof_pos - self.default_dof_pos env_ids = ( @@ -133,7 +133,9 @@ def _reset_idx(self, env_ids): self._reset_system(env_ids) self._resample_commands(env_ids) # * reset buffers - self.dof_pos_history[env_ids] = 0.0 + self.dof_pos_history[env_ids] = ( + self.dof_pos[env_ids] - self.default_dof_pos + ).tile(3) self.episode_length_buf[env_ids] = 0 def _initialize_sim(self): diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index a7fb56cc..5db4ab7a 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -14,30 +14,28 @@ def _init_buffers(self): self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + self._init_sampled_history_buffers() - self.dof_pos_target_history = torch.zeros( - (self.num_envs, self.num_dof, 12), device=self.device - ) - self._init_history_buffers() - - def _init_history_buffers(self): - self.dof_pos_target_history = torch.zeros( - (self.num_envs, self.num_dof * self.cfg.env.history_length), + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), device=self.device, ) - self.dof_pos_history = torch.zeros( + self.sampled_history_dof_pos = torch.zeros( self.num_envs, - self.num_dof * self.cfg.env.history_length, + self.num_dof * self.cfg.env.sampled_history_length, device=self.device, ) - self.dof_vel_history = torch.zeros( + self.sampled_history_dof_vel = torch.zeros( self.num_envs, - self.num_dof * self.cfg.env.history_length, + self.num_dof * self.cfg.env.sampled_history_length, device=self.device, ) - self.history_counter = torch.zeros(self.num_envs, dtype=int, device=self.device) - self.history_threshold = int( - self.cfg.control.ctrl_frequency / self.cfg.env.history_frequency + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) def _reset_system(self, 
env_ids): @@ -56,14 +54,14 @@ def _reset_system(self, env_ids): self.oscillator_obs = torch.cat( (self.oscillators.cos(), self.oscillators.sin()), dim=1 ) - self._reset_history_buffers(env_ids) + self._reset_sampled_history_buffers(env_ids) return - def _reset_history_buffers(self, ids): - n = self.cfg.env.history_length - self.dof_pos_target_history[ids] = self.dof_pos_target[ids].tile(n) - self.dof_pos_history[ids] = self.dof_pos_target[ids].tile(n) - self.dof_vel_history[ids] = self.dof_pos_target[ids].tile(n) + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): @@ -84,7 +82,7 @@ def _compute_torques(self): def _post_decimation_step(self): super()._post_decimation_step() self._step_oscillators() - self._update_history_buffers() + self._update_sampled_history_buffers() def _step_oscillators(self, dt=None): if dt is None: @@ -95,21 +93,24 @@ def _step_oscillators(self, dt=None): (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 ) - def _update_history_buffers(self): - self.history_counter += 1 + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 ids = torch.nonzero( - self.history_counter == self.history_threshold, as_tuple=False + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, ).flatten() - self.dof_pos_target_history[ids].roll(self.num_dof, dims=1) # check - self.dof_pos_target_history[ids, : self.num_dof] = self.dof_pos_target[ids] - self.dof_pos_history[ids].roll(self.num_dof, dims=1) # check - self.dof_pos_history[ids, : self.num_dof] = self.dof_pos_target[ids] - self.dof_vel_history[ids].roll(self.num_dof, dims=1) # check - self.dof_vel_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_pos_target[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - self.history_counter[ids] = 0 + self.sampled_history_counter[ids] = 0 # --- rewards --- @@ -167,7 +168,7 @@ def _reward_dof_vel(self): def _reward_dof_near_home(self): return torch.mean( self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos_obs"] + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] ), dim=1, ) @@ -184,15 +185,17 @@ def _reward_stand_still(self): return rew_vel + rew_pos - rew_base_vel * self._switch("stand") def _reward_stance(self): - phase = torch.maximum( - torch.zeros_like(self.oscillators), -self.oscillators.sin() - ) # positive during swing, negative during stance + # phase = torch.maximum( + # torch.zeros_like(self.oscillators), -self.oscillators.sin() + # ) # positive during swing, negative during stance + phase = self.smooth_sqr_wave(self.oscillators) return (phase * self._compute_grf()).mean(dim=1) def _reward_swing(self): - phase = torch.maximum( - torch.zeros_like(self.oscillators), self.oscillators.sin() - ) # positive during swing, negative 
during stance + # phase = torch.maximum( + # torch.zeros_like(self.oscillators), self.oscillators.sin() + # ) # positive during swing, negative during stance + phase = self.smooth_sqr_wave(self.oscillators + torch.pi) return -(phase * self._compute_grf()).mean(dim=1) def _compute_grf(self, grf_norm=True): @@ -202,6 +205,9 @@ def _compute_grf(self, grf_norm=True): else: return grf + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + def _reward_walk_freq(self): # Penalize deviation from base frequency return torch.mean( @@ -211,3 +217,14 @@ def _reward_walk_freq(self): ), dim=1, ) * self._switch("move") + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.stack( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.stack( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + ) + return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 1e079807..89359a8c 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -9,12 +9,11 @@ class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 - num_observations = 49 + 3 * 18 # 121 num_actuators = 18 episode_length_s = 5 # episode length in seconds - history_length = 3 # n samples - history_frequency = 10 # [Hz] + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -29,17 +28,17 @@ class init_state(LeggedRobotCfg.init_state): default_joint_angles = { "hip_yaw": 0.0, "hip_abad": 0.0, - "hip_pitch": -0.4, - "knee": 0.9, - "ankle": -0.45, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, "shoulder_pitch": 0.0, "shoulder_abad": 0.0, "shoulder_yaw": 0.0, - "elbow": 0.0, + "elbow": -1.25, } # * default COM for basic initialization - pos = [0.0, 0.0, 0.66] # x,y,z [m] + pos = [0.0, 0.0, 0.72] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -47,22 +46,22 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [0.0, 0.0], - "hip_abad": [0.0, 0.0], - "hip_pitch": [-0.29, -0.25], - "knee": [0.67, 0.71], - "ankle": [-0.43, -0.39], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.2, 0.2], + "knee": [-0.2, 0.2], + "ankle": [-0.15, 0.15], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], + "elbow": [0, 0], } dof_vel_range = { - "hip_yaw": [-0.0, 0.1], - "hip_abad": [-0.0, 0.1], - "hip_pitch": [-0.1, -0.1], - "knee": [-0.05, 0.05], - "ankle": [-0.05, 0.05], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -72,15 +71,15 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.8], # z + [0.72, 0.72], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.5, 2.5], # x - [-0.5, 0.5], # y + [-0.75, 2.75], # x + [-0.55, 
0.55], # y [-0.35, 0.1], # z [-0.35, 0.35], # roll [-0.35, 0.35], # pitch @@ -98,7 +97,7 @@ class control(LeggedRobotCfg.control): "shoulder_pitch": 40.0, "shoulder_abad": 40.0, "shoulder_yaw": 40.0, - "elbow": 40.0, + "elbow": 50.0, } # [N*m/rad] damping = { "hip_yaw": 2.0, @@ -118,19 +117,19 @@ class control(LeggedRobotCfg.control): filter_gain = 0.1586 # 1: no filtering, 0: wall class oscillator: - base_frequency = 1.5 # [Hz] + base_frequency = 2.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 4.0] # min max [m/s] - lin_vel_y = 1.0 # max [m/s] - yaw_vel = 1 # max [rad/s] + lin_vel_x = [-1.0, 4.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.35 # max [m/s] + yaw_vel = 0.5 # max [rad/s] class push_robots: - toggle = False - interval_s = 2 + toggle = True + interval_s = 1 max_push_vel_xy = 0.6 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] @@ -147,11 +146,11 @@ class asset(LeggedRobotCfg.asset): ) # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["base", "arm"] + penalize_contacts_on = ["arm"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] + end_effector_names = ["hand", "foot"] # ?? flip_visual_attachments = False - self_collisions = 1 # 1 to disagble, 0 to enable...bitwise filter + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) @@ -198,7 +197,6 @@ class scaling(LeggedRobotCfg.scaling): 0.1, 0.1, ] - dof_pos_obs = dof_pos # # * Action scales dof_pos_target = dof_pos dof_vel = [ @@ -221,7 +219,7 @@ class scaling(LeggedRobotCfg.scaling): 1.0, 1.0, ] - dof_pos_history = 3 * dof_pos_obs + dof_pos_history = 3 * dof_pos class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): @@ -231,8 +229,8 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): class policy(LeggedRobotRunnerCfg.policy): disable_actions = False init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + actor_hidden_dims = [256, 256, 128] + critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" @@ -245,6 +243,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_obs", "dof_vel", "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", "oscillator_obs", ] critic_obs = actor_obs @@ -252,18 +253,17 @@ class policy(LeggedRobotRunnerCfg.policy): actions = ["dof_pos_target"] class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 class reward: class weights: - tracking_ang_vel = 1.5 tracking_lin_vel = 3.0 + tracking_ang_vel = 1.5 orientation = 1.0 torques = 5.0e-4 min_base_height = 1.5 @@ -277,6 +277,7 @@ class weights: dof_near_home = 0.25 stance = 1.0 swing = 1.0 + hips_forward = 0.5 walk_freq = 0.0 class termination_weight: diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..eb32855e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -18,7 +18,7 @@ def setup(args): env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" + # env.cfg.init_state.reset_mode = 
"reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) From 3ae04a080730dd24269d36c222ecc091e936d311 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 23 Mar 2024 12:29:28 -0400 Subject: [PATCH 20/98] properly roll histories, retune action_rate for mc --- gym/envs/base/legged_robot.py | 2 +- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +-- gym/envs/mit_humanoid/mit_humanoid.py | 12 +++++-- gym/envs/mit_humanoid/mit_humanoid_config.py | 36 +++++++++---------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index d635fea7..68178650 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,7 +110,7 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - self.dof_pos_history.roll(self.num_actuators) + self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators) self.dof_pos_history[:, : self.num_actuators] = ( self.dof_pos - self.default_dof_pos ) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7911206a..1a4e0aed 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -113,8 +113,8 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1.0 + action_rate2 = 1.0 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 5db4ab7a..489b463d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -101,13 +101,19 @@ def _update_sampled_history_buffers(self): as_tuple=False, ).flatten() - self.sampled_history_dof_pos_target[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ ids ] - self.sampled_history_dof_pos[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos[ids] = torch.roll( + self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] - self.sampled_history_dof_vel[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] self.sampled_history_counter[ids] = 0 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 89359a8c..54813f7b 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -38,7 +38,7 @@ class init_state(LeggedRobotCfg.init_state): } # * default COM for basic initialization - pos = [0.0, 0.0, 0.72] # x,y,z [m] + pos = [0.0, 0.0, 0.6] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -46,11 +46,11 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.1, 0.1], - "hip_pitch": [-0.2, 0.2], - "knee": [-0.2, 0.2], - "ankle": 
[-0.15, 0.15], + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -71,7 +71,7 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.72, 0.72], # z + [0.64, 0.7], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], @@ -250,7 +250,7 @@ class policy(LeggedRobotRunnerCfg.policy): ] critic_obs = actor_obs - actions = ["dof_pos_target"] + actions = ["dof_pos_target", "oscillator_freq"] class noise: dof_pos = 0.005 @@ -262,23 +262,23 @@ class noise: class reward: class weights: - tracking_lin_vel = 3.0 - tracking_ang_vel = 1.5 - orientation = 1.0 + # tracking_lin_vel = 3.0 + # tracking_ang_vel = 1.5 + # orientation = 1.0 torques = 5.0e-4 - min_base_height = 1.5 + # min_base_height = 1.5 action_rate = 0.01 action_rate2 = 0.001 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.25 - stand_still = 0.25 + # dof_vel = 0.25 + # stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.25 + # dof_near_home = 0.25 stance = 1.0 swing = 1.0 hips_forward = 0.5 - walk_freq = 0.0 + walk_freq = 2.5 class termination_weight: termination = 15 @@ -288,11 +288,11 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.002 + entropy_coef = 0.01 num_learning_epochs = 5 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-4 + learning_rate = 5.0e-5 schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 From ee05de48a620bcf598002a4b76779d9d92be3612 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 24 Mar 2024 18:30:55 -0400 Subject: [PATCH 21/98] enable arms in learnt urdf diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 54813f7..368f2c3 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -142,7 +142,7 @@ class MITHumanoidCfg(LeggedRobotCfg): class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) # foot_collisionbox_names = ["foot"] foot_name = "foot" @@ -289,12 +289,12 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 - num_learning_epochs = 5 + num_learning_epochs = 4 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-5 + learning_rate = 1.0e-6 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 + gamma = 0.99 lam = 0.95 desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf index 5e76f34..98a0773 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -570,7 +570,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -615,7 +615,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -668,7 +668,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -719,7 +719,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -798,7 +798,7 @@ 
Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -843,7 +843,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -896,7 +896,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -947,7 +947,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> --- gym/envs/mit_humanoid/mit_humanoid_config.py | 8 ++++---- .../mit_humanoid/urdf/humanoid_F_sf_learnt.urdf | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 54813f7b..368f2c3b 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -142,7 +142,7 @@ class domain_rand: class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) # foot_collisionbox_names = ["foot"] foot_name = "foot" @@ -289,12 +289,12 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 - num_learning_epochs = 5 + num_learning_epochs = 4 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-5 + learning_rate = 1.0e-6 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 + gamma = 0.99 lam = 0.95 desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf index 5e76f34a..98a0773a 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -570,7 +570,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -615,7 +615,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -668,7 +668,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -719,7 +719,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -798,7 +798,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -843,7 +843,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -896,7 +896,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -947,7 +947,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> From 0aab1005ea71ad3b39712ad4a5dcbd81845d3c8c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 24 Mar 2024 19:15:55 -0400 Subject: [PATCH 22/98] hotfix old runner, and set as default for humanoid. 
New runner is numerically unstable for humanoid --- gym/envs/mit_humanoid/mit_humanoid_config.py | 4 ++-- learning/runners/old_policy_runner.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 368f2c3b..a3e113b6 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -224,7 +224,7 @@ class scaling(LeggedRobotCfg.scaling): class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "OldPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): disable_actions = False @@ -301,7 +301,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" num_steps_per_env = 32 max_iterations = 1000 run_name = "Standing" diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 9a738b93..f42b26cf 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -70,7 +70,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") From de64f6c201831ba888c4fba4825996506ff6ae21 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 10:53:01 -0400 Subject: [PATCH 23/98] WIP: more clean post actor/critic, exporting --- learning/algorithms/ppo2.py | 13 +++------- learning/runners/on_policy_runner.py | 19 +++++++------- setup.py | 5 ++-- .../test_runner_integration.py | 25 ++++++++----------- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 69d6dcd2..bc044d08 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -9,11 +9,10 @@ class PPO2: - # actor_critic: ActorCritic - def __init__( self, - actor_critic, + actor, + critic, num_learning_epochs=1, num_mini_batches=1, clip_param=0.2, @@ -35,12 +34,8 @@ def __init__( self.learning_rate = learning_rate # * PPO components - # todo keeping actor_critic for loading code in runner - self.actor_critic = actor_critic.to(self.device) - self.actor = actor_critic.actor.to(self.device) - self.critic = actor_critic.critic.to(self.device) - self.storage = None # initialized later - # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.actor = actor.to(self.device) + self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 91e2ac26..b93ff0de 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -144,17 +144,15 @@ def set_up_logger(self): "actor", self.alg.actor_critic, ["action_std", "entropy"] ) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": 
self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "iter": self.it, }, @@ -163,18 +161,19 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) + self.alg.actor.export(path) diff --git a/setup.py b/setup.py index c918d911..8a3685da 100644 --- a/setup.py +++ b/setup.py @@ -2,17 +2,16 @@ from distutils.core import setup setup( - name="gpuGym", + name="pkGym", version="1.0.1", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), - description="Isaac Gym environments for Legged Robots", + description="Isaac Gym environments", install_requires=[ "isaacgym", "matplotlib", "pandas", - "tensorboard", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 4781a665..ef2fc6bb 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -7,7 +7,7 @@ # * imports for onnx test import onnx import onnxruntime -from learning.modules import ActorCritic +from learning.modules import Actor from gym.utils.helpers import get_load_path from gym import LEGGED_GYM_ROOT_DIR @@ -28,20 +28,17 @@ def learn_policy(args): def load_saved_policy(runner): num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_critic_obs = runner.get_obs_size(runner.policy_cfg["critic_obs"]) num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg - ).to(runner.device) + actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) loaded_dict = torch.load(resume_path) - actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - actor_critic.eval() - return actor_critic + actor.load_state_dict(loaded_dict["actor_state_dict"]) + actor.eval() + return actor class TestDefaultIntegration: @@ -96,19 +93,19 @@ def test_default_integration_settings(self, args): ) obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) - actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) - actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() assert 
torch.equal(actions_first, actions_loaded), "Model loading failed" # * test onnx exporting - loaded_actor_critic = load_saved_policy(runner) + loaded_actor = load_saved_policy(runner) export_path = os.path.join( LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy" ) - loaded_actor_critic.export_policy(export_path) + loaded_actor.export(export_path) # load the exported onnx path_to_onnx = os.path.join(export_path, "policy.onnx") @@ -122,8 +119,8 @@ def test_default_integration_settings(self, args): # compute torch output with torch.no_grad(): test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] - runner_out = runner.alg.actor_critic.actor.act_inference(test_input) - loaded_out = loaded_actor_critic.actor.act_inference(test_input) + runner_out = runner.alg.actor.act_inference(test_input) + loaded_out = loaded_actor.act_inference(test_input) # compute ONNX Runtime output prediction ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()} From 8dc7b40888d7f32e4154092172ef557608611988 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:08:09 -0400 Subject: [PATCH 24/98] Updating all envs and configs to have split actor and critic --- gym/envs/a1/a1_config.py | 35 ++--- .../anymal_c/flat/anymal_c_flat_config.py | 35 ++--- .../mixed_terrains/anymal_c_rough_config.py | 7 +- gym/envs/base/fixed_robot_config.py | 4 +- gym/envs/base/legged_robot_config.py | 23 ++-- gym/envs/cartpole/cartpole_config.py | 23 +++- gym/envs/mini_cheetah/mini_cheetah_config.py | 38 +++--- .../mini_cheetah/mini_cheetah_osc_config.py | 8 +- .../mini_cheetah/mini_cheetah_ref_config.py | 35 +++-- .../mit_humanoid/humanoid_running_config.py | 34 +++-- gym/envs/mit_humanoid/mit_humanoid_config.py | 26 +++- gym/envs/pendulum/pendulum_config.py | 125 ++++++++++++++++++ learning/algorithms/ppo2.py | 2 +- learning/modules/__init__.py | 1 + learning/modules/actor.py | 5 - learning/modules/critic.py | 7 +- learning/runners/BaseRunner.py | 27 ++-- learning/runners/my_runner.py | 18 +-- learning/runners/old_policy_runner.py | 28 ++-- learning/runners/on_policy_runner.py | 29 ++-- learning/utils/PBRS/test_usecase.py | 8 +- scripts/play.py | 4 +- .../test_runner_integration.py | 16 +-- tests/regression_tests/run_n_iterations.py | 2 +- 24 files changed, 363 insertions(+), 177 deletions(-) create mode 100644 gym/envs/pendulum/pendulum_config.py diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..190ce41a 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -138,12 +138,11 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +153,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +176,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = 
["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -213,7 +216,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..95d95be9 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -121,13 +121,12 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +137,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +160,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -196,6 +199,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 4d039f74..8c7427f8 100644 --- 
a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -125,7 +125,7 @@ class logging: class policy: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -134,7 +134,7 @@ class policy: # rnn_hidden_size = 512 # rnn_num_layers = 1 - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 2955f5a5..7e6cabc6 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -233,24 +233,18 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] actions = ["q_des"] disable_actions = False @@ -263,6 +257,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..f18d5a3b 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -71,11 +71,10 @@ class policy(FixedRobotCfgPPO.policy): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +82,6 @@ class policy(FixedRobotCfgPPO.policy): "pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +91,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + "cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -125,7 +136,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 96 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 689fcd64..28c70123 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -126,13 +126,12 @@ class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor: + 
hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -141,19 +140,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - critic_obs = [ - "base_height", - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -165,8 +154,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic: + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..a4acefa6 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -161,13 +161,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] + class policy: + hidden_dims = [256, 256, 128] critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -244,5 +244,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7911206a..56d369c0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,15 +67,14 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -84,7 +83,25 @@ class policy(MiniCheetahRunnerCfg.policy): "phase_obs", ] - critic_obs = [ + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -96,12 +113,6 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_pos_target", ] - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass - class reward: class 
weights: tracking_lin_vel = 4.0 diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..170e2570 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -275,8 +289,8 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 32 max_iterations = 1000 run_name = "ICRA2023" experiment_name = "HumanoidLocomotion" diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..86a8d606 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -181,14 +181,14 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -198,9 +198,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_history", ] - critic_obs = actor_obs actions = ["dof_pos_target"] + disable_actions = False class noise: base_height = 0.05 @@ -211,6 +211,22 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + class reward: class weights: tracking_ang_vel = 0.5 @@ -248,7 +264,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 
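# A minimal sketch, assuming a 4096-env rollout, of the batch-size bookkeeping
# these runner configs rely on (batch_size = n_steps * n_robots, as noted in the
# a1 config, split across num_mini_batches); the helper name and the numbers are
# illustrative assumptions, not taken from this patch.
def _illustrate_batch_sizing(num_envs=4096, num_steps_per_env=24, num_mini_batches=4):
    batch_size = num_steps_per_env * num_envs          # 98304 transitions collected per iteration
    mini_batch_size = batch_size // num_mini_batches   # 24576 samples per gradient step
    return batch_size, mini_batch_size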
max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py new file mode 100644 index 00000000..920356cd --- /dev/null +++ b/gym/envs/pendulum/pendulum_config.py @@ -0,0 +1,125 @@ +import torch + +from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO + + +class PendulumCfg(FixedRobotCfg): + class env(FixedRobotCfg.env): + num_envs = 2**13 + num_actuators = 1 # 1 for theta connecting base and pole + episode_length_s = 5.0 + + class terrain(FixedRobotCfg.terrain): + pass + + class viewer: + ref_env = 0 + pos = [10.0, 5.0, 10.0] # [m] + lookat = [0.0, 0.0, 0.0] # [m] + + class init_state(FixedRobotCfg.init_state): + default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position + # * "reset_to_range" = uniformly random from a range defined below + reset_mode = "reset_to_range" + + # * initial conditions for reset_to_range + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(FixedRobotCfg.control): + actuated_joints_mask = [1] # angle + ctrl_frequency = 100 + desired_sim_frequency = 200 + stiffness = {"theta": 0.0} # [N*m/rad] + damping = {"theta": 0.0} # [N*m*s/rad] + + class asset(FixedRobotCfg.asset): + # * Things that differ + file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" + disable_gravity = False + disable_motors = False # all torques set to 0 + joint_damping = 0.1 + + class reward_settings(FixedRobotCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(FixedRobotCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + # * Action scales + tau_ff = 1.0 + + +class PendulumRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "DataLoggingRunner" + + class actor: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + obs = [ + "dof_pos", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + standard_critic_nn = True + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.0 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + # training params + value_loss_coef = 1.0 + use_clipped_value_loss = True + clip_param = 0.2 + entropy_coef = 0.01 + num_learning_epochs = 6 + # * mini batch size = num_envs*nsteps / nminibatches + num_mini_batches = 4 + learning_rate = 1.0e-3 + schedule = "fixed" # could be adaptive, fixed + discount_horizon = 2.0 # [s] + lam = 0.98 + # GAE_bootstrap_horizon = .0 # [s] + desired_kl = 0.01 + max_grad_norm = 1.0 + standard_loss = True + plus_c_penalty = 0.1 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 500 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index bc044d08..98a4904c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -53,7 +53,7 @@ def test_mode(self): self.actor.test() self.critic.test() - def train_mode(self): + def switch_to_train(self): 
self.actor.train() self.critic.train() diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..3caf5fec 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .critic import Critic diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 7f40d502..fbaa6868 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -17,11 +17,6 @@ def __init__( normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 7ebe76c8..0bc890c7 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -8,16 +8,11 @@ class Critic(nn.Module): def __init__( self, num_obs, - hidden_dims, + hidden_dims=None, activation="elu", normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self.NN = create_MLP(num_obs, 1, hidden_dims, activation) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index d41e7f07..9e5ab2e3 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import ActorCritic +from learning.modules import Actor, Critic from learning.utils import remove_zero_weighted_rewards @@ -10,28 +10,29 @@ def __init__(self, env, train_cfg, device="cpu"): self.env = env self.parse_train_cfg(train_cfg) - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index cdd30400..887df8b2 100644 --- 
a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -24,9 +24,9 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations rewards_dict self.save() @@ -58,9 +58,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -74,9 +74,9 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["actor_obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -116,9 +116,9 @@ def learn(self): self.save() def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 9a738b93..6d8edf35 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -25,8 +25,8 @@ def learn(self): rewards_dict = {} self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -40,17 +40,17 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -84,21 +84,21 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + 
list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( @@ -134,16 +134,16 @@ def switch_to_eval(self): self.alg.actor_critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor_critic.actor.act_inference(obs) def export(self, path): self.alg.actor_critic.export_policy(path) def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) self.alg.init_storage( self.env.num_envs, self.num_steps_per_env, diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index b93ff0de..d11fe85d 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -26,9 +26,9 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations rewards_dict self.save() @@ -60,9 +60,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -76,9 +76,10 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -120,29 +121,27 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -172,7 +171,7 @@ def switch_to_eval(self): self.alg.critic.eval() def get_inference_actions(self): - obs = 
self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor.act_inference(obs) def export(self, path): diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 --- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = {"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..4486daae 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -46,9 +46,9 @@ def play(env, runner, train_cfg): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() env.check_exit() diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index ef2fc6bb..f2e607c6 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -27,9 +27,9 @@ def learn_policy(args): def load_saved_policy(runner): - num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) + num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"]) + num_actions = runner.get_action_size(runner.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], @@ -57,7 +57,7 @@ def test_default_integration_settings(self, args): with torch.no_grad(): actions = runner.get_inference_actions() - deployed_actions = runner.env.get_states(runner.policy_cfg["actions"]) + deployed_actions = runner.env.get_states(runner.actor_cfg["actions"]) assert ( torch.equal(actions, torch.zeros_like(actions)) is False ), "Policy returning all zeros" @@ -92,7 +92,7 @@ def test_default_integration_settings(self, args): f"{model_5_path}" "(last iteration) was not saved" ) - obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) + obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"])) actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() @@ -118,7 +118,7 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] + test_input = runner.get_obs(runner.actor_cfg["obs"])[0:1] runner_out = 
runner.alg.actor.act_inference(test_input) loaded_out = loaded_actor.act_inference(test_input) @@ -128,8 +128,8 @@ def test_default_integration_settings(self, args): # compare ONNX Runtime and PyTorch results torch.testing.assert_close( - runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-7 ) torch.testing.assert_close( - runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-7 ) diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py index 6a3a4965..29e2f424 100644 --- a/tests/regression_tests/run_n_iterations.py +++ b/tests/regression_tests/run_n_iterations.py @@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner): runner.learn() # * get the test values after learning - actions = runner.env.get_states(runner.policy_cfg["actions"]) + actions = runner.env.get_states(runner.actor_cfg["actions"]) # * return the values to the parent for assertion actions.detach().cpu() From 53e92bcd82640effcbde755af93ac43d999bdf3b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:13:41 -0400 Subject: [PATCH 25/98] put critic loss_fn into critic module, remove value clipping (very quick testing, clipping value doesn't seem to help) --- gym/envs/a1/a1_config.py | 3 ++- gym/envs/anymal_c/flat/anymal_c_flat_config.py | 4 +++- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo2.py | 15 +++------------ learning/modules/critic.py | 3 +++ learning/runners/old_policy_runner.py | 15 +++++++++++++-- learning/runners/on_policy_runner.py | 4 ++++ 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index 190ce41a..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,6 +137,7 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -216,7 +217,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 95d95be9..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,6 +121,7 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -199,6 +201,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 
56d369c0..3c92882d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -140,7 +140,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True + use_clipped_value_loss = True # deprecate for PPO2 clip_param = 0.2 entropy_coef = 0.01 num_learning_epochs = 6 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 98a4904c..971e6241 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,6 +24,7 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + loss_fn="MSE", device="cpu", **kwargs, ): @@ -35,8 +36,8 @@ def __init__( # * PPO components self.actor = actor.to(self.device) - self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) # * PPO parameters @@ -83,17 +84,7 @@ def update_critic(self, data, last_obs=None): generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: value_batch = self.critic.evaluate(batch["critic_obs"]) - if self.use_clipped_value_loss: - target_value_batch = batch["values"] - value_clipped = target_value_batch + ( - value_batch - target_value_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - target_value_batch).pow(2) - value_losses_clipped = (value_clipped - batch["returns"]).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (batch["returns"] - value_batch).pow(2).mean() - + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 0bc890c7..732f8eda 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -25,3 +25,6 @@ def evaluate(self, critic_observations): with torch.no_grad(): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() + + def loss_fn(self, input, target): + return nn.functional.mse_loss(input, target, reduction="mean") diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 6d8edf35..42f019a3 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -2,8 +2,9 @@ import torch from learning.utils import Logger - from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic logger = Logger() @@ -19,6 +20,16 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() @@ -70,7 +81,7 @@ def learn(self): logger.tic("learning") self.alg.update() 
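# A minimal, self-contained sketch of the critic update after this commit,
# assuming a critic that exposes the plain MSE loss_fn added above (value
# clipping removed); the network, optimizer, and synthetic batch below are
# illustrative stand-ins, not code from this repository.
import torch
import torch.nn as nn

critic_nn = nn.Sequential(nn.Linear(4, 32), nn.ELU(), nn.Linear(32, 1))
critic_optimizer = torch.optim.Adam(critic_nn.parameters(), lr=1e-3)

critic_obs = torch.randn(128, 4)   # stand-in for batch["critic_obs"]
returns = torch.randn(128)         # stand-in for batch["returns"]

values = critic_nn(critic_obs).squeeze()               # mirrors Critic.evaluate
value_loss = nn.functional.mse_loss(values, returns)   # mirrors Critic.loss_fn
critic_optimizer.zero_grad()
value_loss.backward()
nn.utils.clip_grad_norm_(critic_nn.parameters(), max_norm=1.0)
critic_optimizer.step()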
logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index d11fe85d..5085fe91 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -153,6 +153,7 @@ def save(self): "actor_state_dict": self.alg.actor.state_dict(), "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, @@ -164,6 +165,9 @@ def load(self, path, load_optimizer=True): self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): From 06e62e3ea3dad16b351bd97f1e8c0ea4dd9d9292 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:29:11 -0400 Subject: [PATCH 26/98] several missing small fixes (started cherry picking one commit late) --- gym/envs/__init__.py | 5 +- gym/envs/base/fixed_robot.py | 14 ++--- .../mit_humanoid/humanoid_running_config.py | 2 +- gym/envs/pendulum/pendulum.py | 43 ++++++++++++++ .../logging_and_saving/wandb_singleton.py | 2 +- learning/algorithms/ppo2.py | 2 +- learning/modules/actor_critic.py | 35 ++---------- learning/modules/utils/normalize.py | 2 +- learning/runners/my_runner.py | 10 +--- learning/runners/on_policy_runner.py | 2 - resources/robots/pendulum/urdf/pendulum.urdf | 56 +++++++++++++++++++ scripts/train.py | 1 + 12 files changed, 121 insertions(+), 53 deletions(-) create mode 100644 gym/envs/pendulum/pendulum.py create mode 100644 resources/robots/pendulum/urdf/pendulum.urdf diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 80ede800..57cbd8c0 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,6 +19,7 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", + "Pendulum": ".pendulum.pendulum", } config_dict = { @@ -30,6 +31,7 @@ "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", + "PendulumCfg": ".pendulum.pendulum_config", } runner_config_dict = { @@ -41,6 +43,7 @@ "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", + "PendulumRunnerCfg": ".pendulum.pendulum_config", } task_dict = { @@ -62,8 +65,8 @@ "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 99c038d9..1fe4c5d5 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -592,11 +592,11 @@ def _sqrdexp(self, x, scale=1.0): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return 
-torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" @@ -609,7 +609,7 @@ def _reward_action_rate(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" @@ -623,11 +623,11 @@ def _reward_action_rate2(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" - return -torch.sum( + return -torch.mean( 1.0 * ( torch.norm( @@ -649,11 +649,11 @@ def _reward_dof_pos_limits(self): max=0.0 ) # lower limit out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 170e2570..8f072f79 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -292,7 +292,7 @@ class runner(LeggedRobotRunnerCfg.runner): algorithm_class_name = "PPO2" num_steps_per_env = 32 max_iterations = 1000 - run_name = "ICRA2023" + run_name = "HumanoidRunning" experiment_name = "HumanoidLocomotion" save_interval = 50 plot_input_gradients = False diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py new file mode 100644 index 00000000..95af1fb2 --- /dev/null +++ b/gym/envs/pendulum/pendulum.py @@ -0,0 +1,43 @@ +import torch + +from gym.envs.base.fixed_robot import FixedRobot + + +class Pendulum(FixedRobot): + def _post_physics_step(self): + """Update all states that are not handled in PhysX""" + super()._post_physics_step() + + def _reward_theta(self): + theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + + def _reward_omega(self): + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + + def _reward_equilibrium(self): + error = torch.abs(self.dof_state) + error[:, 0] /= self.scales["dof_pos"] + error[:, 1] /= self.scales["dof_vel"] + return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) + # return torch.exp( + # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma + # ) + + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + + def _reward_energy(self): + m_pendulum = 1.0 + l_pendulum = 1.0 + kinetic_energy = ( + 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + ) + potential_energy = ( + m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + ) + desired_energy = m_pendulum * 9.81 * l_pendulum + energy_error = kinetic_energy + potential_energy - desired_energy + return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ 
b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 971e6241..5866052c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -74,7 +74,7 @@ def update(self, data, last_obs=None): self.update_critic(data) self.update_actor(data) - def update_critic(self, data, last_obs=None): + def update_critic(self, data): self.mean_value_loss = 0 counter = 0 diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,44 +1,17 @@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, critic_hidden_dims, activation, normalize_obs - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: {self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index d00b69b5..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -4,7 +4,7 @@ class RunningMeanStd(nn.Module): def __init__(self, num_items, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + super().__init__() self.num_items = num_items self.epsilon = epsilon diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 887df8b2..d97dee8d 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -28,7 +28,6 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -112,7 +111,6 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - storage.clear() self.save() def set_up_logger(self): @@ -124,10 +122,6 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 5085fe91..2cd6a9d8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ 
-30,7 +30,6 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -115,7 +114,6 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf new file mode 100644 index 00000000..90e894e4 --- /dev/null +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/train.py b/scripts/train.py index dfa6afea..283717e8 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -14,6 +14,7 @@ def setup(): env = task_registry.make_env(name=args.task, env_cfg=env_cfg) randomize_episode_counters(env) + randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) From ac6cab1145fbe90c8ba9a5a307b96a3aaf41551f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 15:28:21 -0400 Subject: [PATCH 27/98] initial commit for off-policy (but its still PPO2) --- gym/envs/pendulum/pendulum_config.py | 12 +- learning/algorithms/__init__.py | 1 + learning/algorithms/sac.py | 168 ++++++++++++++++++++++++ learning/runners/__init__.py | 3 +- learning/runners/off_policy_runner.py | 180 ++++++++++++++++++++++++++ 5 files changed, 357 insertions(+), 7 deletions(-) create mode 100644 learning/algorithms/sac.py create mode 100644 learning/runners/off_policy_runner.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..7a27e2db 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -27,9 +27,9 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi, torch.pi], + "theta": [-torch.pi / 4, torch.pi / 4], } - dof_vel_range = {"theta": [-5, 5]} + dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "DataLoggingRunner" + runner_class_name = "OffPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -91,9 +91,9 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.05 dof_vel = 0.0 - torques = 0.025 + torques = 0.01 class termination_weight: termination = 0.0 @@ -121,5 +121,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "SAC" num_steps_per_env = 32 diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index ace1b9fe..78231181 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,3 +33,4 @@ from .ppo import PPO from .ppo2 import PPO2 from .SE import StateEstimator +from .sac import SAC \ No newline at end of file diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py new file mode 100644 index 00000000..61b59bd1 --- /dev/null +++ b/learning/algorithms/sac.py @@ -0,0 +1,168 @@ +import torch +import torch.nn as nn +import torch.optim as 
optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) + + +class SAC: + def __init__( + self, + actor, + critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + loss_fn="MSE", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def test_mode(self): + self.actor.test() + self.critic.test() + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + return self.actor.act(obs).detach() + + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + + self.update_critic(data) + self.update_actor(data) + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! 
refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + # ! check this + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_( + list(self.actor.parameters()) + list(self.critic.parameters()), + self.max_grad_norm, + ) + self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index b0f49217..a0840b06 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,4 +32,5 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .old_policy_runner import OldPolicyRunner \ No newline at end of file +from .old_policy_runner import OldPolicyRunner +from .off_policy_runner import OffPolicyRunner \ No newline at end of file diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py new file mode 100644 index 00000000..4a6af887 --- /dev/null +++ b/learning/runners/off_policy_runner.py @@ -0,0 +1,180 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class OffPolicyRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + 
self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) From 
df6a655ee87e1afa080e8777c924a33d1b23c0ab Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 18:37:58 -0400 Subject: [PATCH 28/98] pendulum baseline --- gym/envs/base/legged_robot_config.py | 27 ++++++++++------- gym/envs/mini_cheetah/mini_cheetah_config.py | 15 +--------- .../mini_cheetah/mini_cheetah_ref_config.py | 20 ++----------- gym/envs/pendulum/pendulum_config.py | 3 +- learning/algorithms/ppo2.py | 29 +++++++++---------- learning/utils/dict_utils.py | 11 ++++++- 6 files changed, 44 insertions(+), 61 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..f932ecd2 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -288,25 +288,30 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + mini_batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 24 + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..37c600a2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -190,20 +190,7 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..b8e167eb 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -138,27 +138,11 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True # deprecate for PPO2 - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - storage_size = 2**17 # new - mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 # [s] - lam = 0.95 - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - 
num_steps_per_env = 32 + num_steps_per_env = 32 # deprecate diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 7a27e2db..43f37a35 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -84,7 +84,6 @@ class critic: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - standard_critic_nn = True class reward: class weights: @@ -107,7 +106,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 1.0e-3 + learning_rate = 1.0e-4 schedule = "fixed" # could be adaptive, fixed discount_horizon = 2.0 # [s] lam = 0.98 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..1463468a 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -13,8 +13,8 @@ def __init__( self, actor, critic, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -42,8 +42,8 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -78,10 +78,11 @@ def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: value_batch = self.critic.evaluate(batch["critic_obs"]) value_loss = self.critic.loss_fn(value_batch, batch["returns"]) @@ -94,8 +95,6 @@ def update_critic(self, data): self.mean_value_loss /= counter def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) self.mean_surrogate_loss = 0 counter = 0 @@ -106,12 +105,12 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - # ! 
refactor how this is done self.actor.act(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..eaf7b9db 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -60,10 +60,19 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator(data, batch_size, num_epochs, keys=None): +def create_uniform_generator( + data, batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): n, m = data.shape total_data = n * m + + if batch_size > total_data: + batch_size = total_data + num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 0a20324df89fcd5b7bf5d90afced862134894ce0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 10:20:06 -0400 Subject: [PATCH 29/98] quick fix on storage config --- gym/envs/base/fixed_robot_config.py | 25 ++++++++++++-------- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/pendulum/pendulum_config.py | 34 +++++++++++++++------------- learning/storage/SE_storage.py | 8 +++---- learning/storage/rollout_storage.py | 8 +++---- 5 files changed, 42 insertions(+), 35 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..fbecced6 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -165,20 +165,25 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index f932ecd2..43b09e68 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -296,7 +296,7 @@ class algorithm: max_grad_steps = 10 # new storage_size = 2**17 # new - mini_batch_size = 2**15 # new + batch_size = 2**15 # new clip_param = 0.2 learning_rate = 1.0e-3 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 43f37a35..4ad6c879 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OffPolicyRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -98,27 +98,29 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 
0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-4 - schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] + schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 - max_grad_norm = 1.0 - standard_loss = True - plus_c_penalty = 0.1 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "SAC" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/storage/SE_storage.py b/learning/storage/SE_storage.py index a63f5f9f..0f4eaf41 100644 --- a/learning/storage/SE_storage.py +++ b/learning/storage/SE_storage.py @@ -53,9 +53,9 @@ def clear(self): def mini_batch_generator(self, num_mini_batches, num_epochs=8): """Generate mini batch for learning""" batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -69,8 +69,8 @@ def mini_batch_generator(self, num_mini_batches, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index 38a315d7..0f2885a3 100644 --- a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -184,9 +184,9 @@ def get_statistics(self): def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env - mini_batch_size = batch_size // num_mini_batches + batch_size = batch_size // num_mini_batches indices = torch.randperm( - num_mini_batches * mini_batch_size, + num_mini_batches * batch_size, requires_grad=False, device=self.device, ) @@ -207,8 +207,8 @@ def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): for epoch in range(num_epochs): for i in range(num_mini_batches): - start = i * mini_batch_size - end = (i + 1) * mini_batch_size + start = i * batch_size + end = (i + 1) * batch_size batch_idx = indices[start:end] obs_batch = observations[batch_idx] From 6fd4ba960ecbfd89d20424bc0ae889597947d335 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 10:45:47 -0400 Subject: [PATCH 30/98] refactor GAE computation to be explicit --- learning/algorithms/ppo2.py | 5 ++++- learning/utils/dict_utils.py | 34 +++++++++++++--------------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 1463468a..08c2f50d 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -67,9 +67,12 @@ def update(self, data, last_obs=None): else: with torch.no_grad(): last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( + + data["values"] = self.critic.evaluate(data["critic_obs"]) + 
data["advantages"] = compute_generalized_advantages( data, self.gamma, self.lam, self.critic, last_values ) + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) self.update_actor(data) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index eaf7b9db..d06250d4 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -9,30 +9,28 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): else: last_values = critic.evaluate(data["critic_obs"][-1]) - data.update({"returns": torch.zeros_like(data["rewards"])}) - data["returns"][-1] = last_values * ~data["dones"][-1] + returns = torch.zeros_like(data["rewards"]) + returns[-1] = last_values * ~data["dones"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] - data["returns"][k] = ( - data["rewards"][k] + gamma * data["returns"][k + 1] * not_done - ) - data["returns"] = (data["returns"] - data["returns"].mean()) / ( - data["returns"].std() + 1e-8 - ) - return + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + + return normalize(returns) @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) - data.update({"advantages": torch.zeros_like(data["values"])}) +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -47,15 +45,9 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) - - data["returns"] = data["advantages"] + data["values"] + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + return normalize(advantages) # todo change num_epochs to num_batches From 890d56246d6cf8da36d6f5e6c10aee9afe2b040f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 16:54:16 -0400 Subject: [PATCH 31/98] set up off-policy, and SAC value function update. 
--- gym/envs/pendulum/pendulum_config.py | 6 +- learning/algorithms/ppo2.py | 5 +- learning/algorithms/sac.py | 111 +++++++++++++++++++------- learning/runners/off_policy_runner.py | 24 +++++- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 4ad6c879..5e2e0e01 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "OffPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -114,7 +114,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): # Critic use_clipped_value_loss = True # Actor - entropy_coef = 0.01 + entropy_coef = 0.1 schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 @@ -122,5 +122,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "SAC" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 08c2f50d..9d80655c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -160,10 +160,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 61b59bd1..2565b735 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -12,9 +12,10 @@ class SAC: def __init__( self, actor, - critic, - num_learning_epochs=1, - num_mini_batches=1, + critic_1, + critic_2, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -37,26 +38,41 @@ def __init__( # * PPO components self.actor = actor.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) - self.critic = critic.to(self.device) - self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + self.critic_1 = critic_1.to(self.device) + self.critic_2 = critic_2.to(self.device) + self.critic_optimizer = optim.Adam( + list(self.critic_1.parameters()) + list(self.critic_2.parameters()), + lr=learning_rate, + ) # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss + # * SAC parameters + self.learning_starts = 100 + self.batch_size = batch_size + self.tau = 0.005 + self.gamma = 0.99 + self.ent_coef = "fixed" + self.target_entropy = "fixed" + self.mean_surrogate_loss = 0 + self.mean_value_loss = 0 def test_mode(self): self.actor.test() - self.critic.test() + self.critic_1.test() + self.critic_2.test() def switch_to_train(self): self.actor.train() - self.critic.train() + self.critic_1.train() + self.critic_2.train() def act(self, obs, critic_obs): return self.actor.act(obs).detach() @@ -64,30 +80,67 @@ def act(self, obs, critic_obs): def update(self, 
data, last_obs=None): if last_obs is None: last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values - ) + # else: + # todo + # with torch.no_grad(): + # last_values_1 = self.critic_1.evaluate(last_obs).detach() + # last_values_2 = self.critic_2.evaluate(last_obs).detach() + # last_values = torch.min(last_values_1, last_values_2) + + # data["values"] = self.critic.evaluate(data["critic_obs"]) + # data["advantages"] = compute_generalized_advantages( + # data, self.gamma, self.lam, self.critic, last_values + # ) + # data["returns"] = data["advantages"] + data["values"] self.update_critic(data) + + # todo properly take min over here too? but might not be needed for SAC anyway + obs_actions = torch.cat((data["critic_obs"], data["actions"]), dim=-1) + data["values"] = self.critic_1.evaluate(obs_actions) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic_1, last_values + ) + data["returns"] = data["advantages"] + data["values"] self.update_actor(data) def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + actions = self.actor.act(batch["actor_obs"]) + actions_log_prob = self.actor.get_actions_log_prob(actions) + + obs_action_batch = torch.cat((batch["critic_obs"], actions), dim=-1) + qvalue_1 = self.critic_1.evaluate(obs_action_batch) + qvalue_2 = self.critic_2.evaluate(obs_action_batch) + qvalue_min = torch.min(qvalue_1, qvalue_2) + + targets = ( + self.gamma + * batch["dones"].logical_not() + * (qvalue_min - self.entropy_coef * actions_log_prob) + ) + batch["rewards"] + + value_loss = self.critic_1.loss_fn( + qvalue_1, targets + ) + self.critic_2.loss_fn(qvalue_2, targets) + ##### + # value_batch = self.critic.evaluate(batch["critic_obs"]) + # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + ##### self.critic_optimizer.zero_grad() value_loss.backward() - nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + list(self.critic_1.parameters()) + list(self.critic_2.parameters()), + self.max_grad_norm, + ) self.critic_optimizer.step() self.mean_value_loss += value_loss.item() counter += 1 @@ -106,10 +159,11 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: # ! 
refactor how this is done self.actor.act(batch["actor_obs"]) @@ -158,10 +212,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 4a6af887..81963022 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -5,7 +5,9 @@ from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.modules import Actor, Critic from learning.storage import DictStorage +from learning.algorithms import SAC logger = Logger() storage = DictStorage() @@ -21,6 +23,15 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = SAC(actor, critic_1, critic_2, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() @@ -141,7 +152,9 @@ def set_up_logger(self): ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_1, self.alg.critic_2) + ) def save(self): os.makedirs(self.log_dir, exist_ok=True) @@ -149,7 +162,8 @@ def save(self): torch.save( { "actor_state_dict": self.alg.actor.state_dict(), - "critic_state_dict": self.alg.critic.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, @@ -160,7 +174,8 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) + self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_optimizer.load_state_dict( @@ -170,7 +185,8 @@ def load(self, path, load_optimizer=True): def switch_to_eval(self): self.alg.actor.eval() - self.alg.critic.eval() + self.alg.critic_1.eval() + self.alg.critic_2.eval() def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) From 200a14733fa167b95d3e38c4c4937860faf8c096 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 17:37:02 -0400 Subject: [PATCH 32/98] store next_obs explicitly, use for last_values Comment: last_values doesn't seem to have much of an impact on performance at all. However, for off-policy algorithms, having access to the next state (obs) explicitly starts to be a lot handier. 
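Concretely, once next_actor_obs / next_critic_obs are recorded in every transition, bootstrapping the tail of a rollout reduces to a lookup; a minimal sketch, assuming the dict-of-tensors layout and the critic.evaluate() interface used below (the helper itself is illustrative):

    import torch

    @torch.no_grad()
    def last_step_bootstrap(data, critic, gamma):
        # The observation seen after the final env.step() is already stored,
        # so no separate last_obs has to be threaded through update().
        not_done = ~data["dones"][-1]
        last_values = critic.evaluate(data["next_critic_obs"][-1])
        return data["rewards"][-1] + gamma * last_values * not_done
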
Alternative is to do some data juggling to pull it out, which is a bit tedious and probably needs to be stored in a tensor anyway to allow the batch generation --- learning/algorithms/ppo2.py | 10 ++-------- learning/runners/on_policy_runner.py | 4 ++++ learning/utils/dict_utils.py | 4 +++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 9d80655c..6462fdba 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -61,16 +61,10 @@ def switch_to_train(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - + def update(self, data): data["values"] = self.critic.evaluate(data["critic_obs"]) data["advantages"] = compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..6a4a4265 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,8 +37,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -89,6 +91,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index d06250d4..3b102338 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -24,7 +24,8 @@ def normalize(input, eps=1e-8): @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this @@ -65,6 +66,7 @@ def create_uniform_generator( if max_gradient_steps: num_epochs = max_gradient_steps // num_batches_per_epoch num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 96d5abb548e3d1cd7307415ff377f074311e23be Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 18:43:58 -0400 Subject: [PATCH 33/98] refactor critic loss function to consume obs and self-evaluate --- learning/algorithms/ppo2.py | 4 +--- learning/modules/critic.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 6462fdba..ed796400 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,7 +24,6 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - loss_fn="MSE", device="cpu", **kwargs, ): @@ -81,8 +80,7 @@ def update_critic(self, data): max_gradient_steps=self.max_gradient_steps, ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + value_loss = 
self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 732f8eda..8ef5395d 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -26,5 +26,5 @@ def evaluate(self, critic_observations): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - def loss_fn(self, input, target): - return nn.functional.mse_loss(input, target, reduction="mean") + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") From bf056128a0cd343bb5d2d77e91e6ec210767f4b8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 27 Mar 2024 09:30:17 -0400 Subject: [PATCH 34/98] WIP: should be SAC now, with a lot of rough edges, and missing the tanh squashing --- learning/algorithms/sac.py | 63 +++++++++++---------------- learning/runners/off_policy_runner.py | 4 ++ 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 2565b735..d0ea3906 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -4,7 +4,6 @@ from learning.utils import ( create_uniform_generator, - compute_generalized_advantages, ) @@ -77,35 +76,9 @@ def switch_to_train(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - # else: - # todo - # with torch.no_grad(): - # last_values_1 = self.critic_1.evaluate(last_obs).detach() - # last_values_2 = self.critic_2.evaluate(last_obs).detach() - # last_values = torch.min(last_values_1, last_values_2) - - # data["values"] = self.critic.evaluate(data["critic_obs"]) - # data["advantages"] = compute_generalized_advantages( - # data, self.gamma, self.lam, self.critic, last_values - # ) - # data["returns"] = data["advantages"] + data["values"] - - self.update_critic(data) - - # todo properly take min over here too? 
but might not be needed for SAC anyway - obs_actions = torch.cat((data["critic_obs"], data["actions"]), dim=-1) - data["values"] = self.critic_1.evaluate(obs_actions) - data["advantages"] = compute_generalized_advantages( - data, self.gamma, self.lam, self.critic_1, last_values - ) - data["returns"] = data["advantages"] + data["values"] - self.update_actor(data) - - def update_critic(self, data): + def update(self, data): self.mean_value_loss = 0 + self.mean_surrogate_loss = 0 counter = 0 generator = create_uniform_generator( @@ -114,23 +87,24 @@ def update_critic(self, data): max_gradient_steps=self.max_gradient_steps, ) for batch in generator: - actions = self.actor.act(batch["actor_obs"]) - actions_log_prob = self.actor.get_actions_log_prob(actions) + next_action = self.actor.act(batch["next_actor_obs"]) + next_actions_log_prob = self.actor.get_actions_log_prob(next_action) - obs_action_batch = torch.cat((batch["critic_obs"], actions), dim=-1) - qvalue_1 = self.critic_1.evaluate(obs_action_batch) - qvalue_2 = self.critic_2.evaluate(obs_action_batch) + next_obs_action = torch.cat((batch["next_critic_obs"], next_action), dim=-1) + qvalue_1 = self.critic_1.evaluate(next_obs_action) + qvalue_2 = self.critic_2.evaluate(next_obs_action) qvalue_min = torch.min(qvalue_1, qvalue_2) targets = ( self.gamma * batch["dones"].logical_not() - * (qvalue_min - self.entropy_coef * actions_log_prob) + * (qvalue_min - self.entropy_coef * next_actions_log_prob) ) + batch["rewards"] + obs_action = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) value_loss = self.critic_1.loss_fn( - qvalue_1, targets - ) + self.critic_2.loss_fn(qvalue_2, targets) + obs_action, targets + ) + self.critic_2.loss_fn(obs_action, targets) ##### # value_batch = self.critic.evaluate(batch["critic_obs"]) # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) @@ -144,6 +118,21 @@ def update_critic(self, data): self.critic_optimizer.step() self.mean_value_loss += value_loss.item() counter += 1 + + action = self.actor.act(batch["actor_obs"]) + actions_log_prob = self.actor.get_actions_log_prob(action) + obs_action = torch.cat((batch["critic_obs"], action), dim=-1) + qvalue_1 = self.critic_1.evaluate(obs_action) + qvalue_2 = self.critic_2.evaluate(obs_action) + qvalue_min = torch.min(qvalue_1, qvalue_2) + # update actor + actor_loss = (qvalue_min - self.entropy_coef * actions_log_prob).mean() + self.optimizer.zero_grad() + actor_loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += actor_loss.item() + self.mean_value_loss /= counter def update_actor(self, data): diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 81963022..e70971ed 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -48,8 +48,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -100,6 +102,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, From 692502c8f5fd12909b572cdacc646afdcbe5c368 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 27 Mar 2024 13:30:11 -0400 Subject: [PATCH 35/98] zap useless code 
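(For the tanh squashing still flagged as missing in the SAC draft above: the usual squashed-Gaussian sampling with its change-of-variables log-prob correction looks roughly like the sketch below; the function and the explicit bounds are illustrative, not repo API.)

    import torch

    def squashed_gaussian_sample(mean, std, action_min, action_max, eps=1e-6):
        # Reparameterized sample, squashed to (-1, 1) with tanh, then rescaled
        # to the action bounds; the log-prob is corrected by log(1 - tanh(u)^2).
        dist = torch.distributions.Normal(mean, std)
        u = dist.rsample()
        a = torch.tanh(u)
        log_prob = dist.log_prob(u).sum(-1) - torch.log(1.0 - a.pow(2) + eps).sum(-1)
        scale = (action_max - action_min) / 2.0
        offset = (action_max + action_min) / 2.0
        return a * scale + offset, log_prob
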
--- learning/algorithms/ppo.py | 3 --- learning/algorithms/ppo2.py | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 92898ad5..befeea79 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -105,9 +105,6 @@ def init_storage( self.device, ) - def test_mode(self): - self.actor_critic.test() - def train_mode(self): self.actor_critic.train() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index ed796400..3e5f454d 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -49,15 +49,11 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def test_mode(self): - self.actor.test() - self.critic.test() - def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data): From 5c7d8984d00ccea8bf088e63c3dc1e3adc07dd7c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 13:50:43 -0400 Subject: [PATCH 36/98] WIP: SAC implementation (based off rsl_rl) runs Does not train, need to tune. --- .../mini_cheetah/mini_cheetah_osc_config.py | 1 + learning/algorithms/sac.py | 372 ++++++++++-------- learning/modules/__init__.py | 1 + learning/modules/chimera_actor.py | 76 ++++ learning/modules/critic.py | 11 +- learning/runners/off_policy_runner.py | 71 ++-- learning/runners/on_policy_runner.py | 4 +- learning/utils/__init__.py | 2 +- learning/utils/utils.py | 6 + 9 files changed, 356 insertions(+), 188 deletions(-) create mode 100644 learning/modules/chimera_actor.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a4acefa6..5aaafde3 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -160,6 +160,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" class policy: hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index d0ea3906..52be75ed 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -2,9 +2,7 @@ import torch.nn as nn import torch.optim as optim -from learning.utils import ( - create_uniform_generator, -) +from learning.utils import create_uniform_generator, polyak_update class SAC: @@ -13,196 +11,252 @@ def __init__( actor, critic_1, critic_2, + target_critic_1, + target_critic_2, batch_size=2**15, max_gradient_steps=10, - clip_param=0.2, - gamma=0.998, - lam=0.95, - entropy_coef=0.0, - learning_rate=1e-3, + action_max=10.0, + action_min=-10.0, + actor_noise_std=1.0, + log_std_max=4.0, + log_std_min=-20.0, + alpha=0.2, + alpha_lr=1e-4, + target_entropy=None, max_grad_norm=1.0, - use_clipped_value_loss=True, - schedule="fixed", - desired_kl=0.01, - loss_fn="MSE", + polyak=0.995, + gamma=0.99, + # clip_param=0.2, # * PPO + # gamma=0.998, + # lam=0.95, + # entropy_coef=0.0, + actor_lr=1e-4, + critic_lr=1e-4, + # max_grad_norm=1.0, + # use_clipped_value_loss=True, + # schedule="fixed", + # desired_kl=0.01, device="cpu", **kwargs, ): self.device = device - self.desired_kl = desired_kl - self.schedule = schedule - self.learning_rate = learning_rate + # self.desired_kl = desired_kl + # self.schedule = schedule + # self.lr = lr # * PPO components self.actor = actor.to(self.device) - 
self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_1 = critic_1.to(self.device) self.critic_2 = critic_2.to(self.device) - self.critic_optimizer = optim.Adam( - list(self.critic_1.parameters()) + list(self.critic_2.parameters()), - lr=learning_rate, + self.target_critic_1 = target_critic_1.to(self.device) + self.target_critic_2 = target_critic_2.to(self.device) + self.target_critic_1.load_state_dict(self.critic_1.state_dict()) + self.target_critic_2.load_state_dict(self.critic_2.state_dict()) + + self.log_alpha = torch.log(torch.tensor(alpha)).requires_grad_() + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) + self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr) + self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr) + + # this is probably something to put into the neural network + self.action_max = action_max + self.action_min = action_min + + self.action_delta = (self.action_max - self.action_min) / 2.0 + self.action_offset = (self.action_max + self.action_min) / 2.0 + + self.max_grad_norm = max_grad_norm + self.target_entropy = ( + target_entropy if target_entropy else -self.actor.num_actions ) # * PPO parameters - self.clip_param = clip_param - self.batch_size = batch_size + # self.clip_param = clip_param + # self.batch_size = batch_size self.max_gradient_steps = max_gradient_steps - self.entropy_coef = entropy_coef - self.gamma = gamma - self.lam = lam - self.max_grad_norm = max_grad_norm - self.use_clipped_value_loss = use_clipped_value_loss + # self.entropy_coef = entropy_coef + # self.gamma = gamma + # self.lam = lam + # self.max_grad_norm = max_grad_norm + # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters self.learning_starts = 100 self.batch_size = batch_size - self.tau = 0.005 - self.gamma = 0.99 - self.ent_coef = "fixed" - self.target_entropy = "fixed" - self.mean_surrogate_loss = 0 - self.mean_value_loss = 0 - - def test_mode(self): - self.actor.test() - self.critic_1.test() - self.critic_2.test() + self.polyak = polyak + self.gamma = gamma + # self.ent_coef = "fixed" + # self.target_entropy = "fixed" + + @property + def alpha(self): + return self.log_alpha.exp() def switch_to_train(self): self.actor.train() self.critic_1.train() self.critic_2.train() + self.target_critic_1.train() + self.target_critic_2.train() - def act(self, obs, critic_obs): - return self.actor.act(obs).detach() + def switch_to_eval(self): + self.actor.eval() + self.critic_1.eval() + self.critic_2.eval() + self.target_critic_1.eval() + self.target_critic_2.eval() - def update(self, data): - self.mean_value_loss = 0 - self.mean_surrogate_loss = 0 - counter = 0 + def act(self, obs): + mean, std = self.actor.forward(obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() - generator = create_uniform_generator( - data, - self.batch_size, - max_gradient_steps=self.max_gradient_steps, - ) - for batch in generator: - next_action = self.actor.act(batch["next_actor_obs"]) - next_actions_log_prob = self.actor.get_actions_log_prob(next_action) - - next_obs_action = torch.cat((batch["next_critic_obs"], next_action), dim=-1) - qvalue_1 = self.critic_1.evaluate(next_obs_action) - qvalue_2 = self.critic_2.evaluate(next_obs_action) - qvalue_min = torch.min(qvalue_1, qvalue_2) - - targets = ( - self.gamma - * batch["dones"].logical_not() - * (qvalue_min - 
self.entropy_coef * next_actions_log_prob) - ) + batch["rewards"] - - obs_action = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) - value_loss = self.critic_1.loss_fn( - obs_action, targets - ) + self.critic_2.loss_fn(obs_action, targets) - ##### - # value_batch = self.critic.evaluate(batch["critic_obs"]) - # value_loss = self.critic.loss_fn(value_batch, batch["returns"]) - ##### - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm_( - list(self.critic_1.parameters()) + list(self.critic_2.parameters()), - self.max_grad_norm, - ) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() - counter += 1 - - action = self.actor.act(batch["actor_obs"]) - actions_log_prob = self.actor.get_actions_log_prob(action) - obs_action = torch.cat((batch["critic_obs"], action), dim=-1) - qvalue_1 = self.critic_1.evaluate(obs_action) - qvalue_2 = self.critic_2.evaluate(obs_action) - qvalue_min = torch.min(qvalue_1, qvalue_2) - # update actor - actor_loss = (qvalue_min - self.entropy_coef * actions_log_prob).mean() - self.optimizer.zero_grad() - actor_loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) - self.optimizer.step() - self.mean_surrogate_loss += actor_loss.item() - - self.mean_value_loss /= counter - - def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) - self.mean_surrogate_loss = 0 - counter = 0 - - self.actor.act(data["actor_obs"]) - data["old_sigma_batch"] = self.actor.action_std.detach() - data["old_mu_batch"] = self.actor.action_mean.detach() - data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( - data["actions"] - ).detach() + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + def update(self, data): generator = create_uniform_generator( data, self.batch_size, max_gradient_steps=self.max_gradient_steps, ) + + count = 0 + self.mean_actor_loss = 0 + self.mean_alpha_loss = 0 + self.mean_critic_1_loss = 0 + self.mean_critic_2_loss = 0 + for batch in generator: - # ! refactor how this is done - self.actor.act(batch["actor_obs"]) - actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) - mu_batch = self.actor.action_mean - sigma_batch = self.actor.action_std - entropy_batch = self.actor.entropy - - # * KL - if self.desired_kl is not None and self.schedule == "adaptive": - with torch.inference_mode(): - kl = torch.sum( - torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) - + ( - torch.square(batch["old_sigma_batch"]) - + torch.square(batch["old_mu_batch"] - mu_batch) - ) - / (2.0 * torch.square(sigma_batch)) - - 0.5, - axis=-1, - ) - kl_mean = torch.mean(kl) - - if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) - elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) - - for param_group in self.optimizer.param_groups: - # ! 
check this - param_group["lr"] = self.learning_rate - - # * Surrogate loss - ratio = torch.exp( - actions_log_prob_batch - - torch.squeeze(batch["old_actions_log_prob_batch"]) + self.update_critic(batch) + self.update_actor_and_alpha(batch) + + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak ) - surrogate = -torch.squeeze(batch["advantages"]) * ratio - surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( - ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak ) - surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() - - loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() - - # * Gradient step - self.optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) - self.optimizer.step() - self.mean_surrogate_loss += surrogate_loss.item() - counter += 1 - self.mean_surrogate_loss /= counter + count += 1 + self.mean_actor_loss /= count + self.mean_alpha_loss /= count + self.mean_critic_1_loss /= count + self.mean_critic_2_loss /= count + + return None + + def update_critic(self, batch): + critic_obs = batch["critic_obs"] + actions = batch["actions"] + rewards = batch["rewards"] + next_actor_obs = batch["next_actor_obs"] + next_critic_obs = batch["next_critic_obs"] + dones = batch["dones"] + + with torch.no_grad(): + # * self._sample_action(actor_next_obs) + mean, std = self.actor.forward(next_actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + 1.0 - actions_normalized.pow(2) + 1e-6 + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + target_action = actions_scaled + target_action_logp = action_logp + + # * self._critic_input + # ! 
def should put the action computation into the actor + target_critic_in = torch.cat((next_critic_obs, target_action), dim=-1) + target_critic_prediction_1 = self.target_critic_1.forward(target_critic_in) + target_critic_prediction_2 = self.target_critic_2.forward(target_critic_in) + + target_next = ( + torch.min(target_critic_prediction_1, target_critic_prediction_2) + - self.alpha * target_action_logp + ) + target = rewards + self.gamma * dones.logical_not() * target_next + + critic_in = torch.cat((critic_obs, actions), dim=-1) + + # critic_prediction_1 = self.critic_1.forward(critic_in) + critic_loss_1 = self.critic_1.loss_fn(critic_in, target) + self.critic_1_optimizer.zero_grad() + critic_loss_1.backward() + nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + self.critic_1_optimizer.step() + + # critic_prediction_2 = self.critic_2.forward(critic_in) + critic_loss_2 = self.critic_2.loss_fn(critic_in, target) + self.critic_2_optimizer.zero_grad() + critic_loss_2.backward() + nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + self.critic_2_optimizer.step() + + self.mean_critic_1_loss += critic_loss_1.item() + self.mean_critic_2_loss += critic_loss_2.item() + + return + + def update_actor_and_alpha(self, batch): + actor_obs = batch["actor_obs"] + critic_obs = batch["critic_obs"] + + mean, std = self.actor.forward(actor_obs, deterministic=False) + distribution = torch.distributions.Normal(mean, std) + actions = distribution.rsample() + + ## * self._scale_actions(actions, intermediate=True) + actions_normalized = torch.tanh(actions) + # RSL also does a resahpe(-1, self.action_size), not sure why + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + ## * + action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + 1.0 - actions_normalized.pow(2) + 1e-6 + ).sum(-1) + + # * returns target_action = actions_scaled, target_action_logp = action_logp + actor_prediction = actions_scaled + actor_prediction_logp = action_logp + + alpha_loss = ( + -self.log_alpha * (action_logp + self.target_entropy).detach().mean() + ) + + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() + + critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) + q_value_1 = self.critic_1.forward(critic_in) + q_value_2 = self.critic_2.forward(critic_in) + actor_loss = ( + self.alpha.detach() * actor_prediction_logp + - torch.min(q_value_1, q_value_2) + ).mean() + self.actor_optimizer.zero_grad() + actor_loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.actor_optimizer.step() + + self.mean_alpha_loss += alpha_loss.item() + self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 3caf5fec..8d4a3ab1 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -34,3 +34,4 @@ from .state_estimator import StateEstimatorNN from .actor import Actor from .critic import Critic +from .chimera_actor import ChimeraActor \ No newline at end of file diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py new file mode 100644 index 00000000..5902e3b7 --- /dev/null +++ b/learning/modules/chimera_actor.py @@ -0,0 +1,76 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal +from .utils import create_MLP +from .utils import export_network +from .utils import RunningMeanStd + + +class 
ChimeraActor(nn.Module): + def __init__( + self, + num_obs, + num_actions, + hidden_dims, + split_idx=1, + activation="tanh", + log_std_max: float = 4.0, + log_std_min: float = -20.0, + std_init: float = 1.0, + normalize_obs=True, + **kwargs, + ): + super().__init__() + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.log_std_max = log_std_max + self.log_std_min = log_std_min + self.log_std_init = torch.tensor([std_init]).log() # refactor + + self.num_obs = num_obs + self.num_actions = num_actions + + assert split_idx < len(hidden_dims) + self.latent_NN = create_MLP( + num_obs, hidden_dims[-split_idx], hidden_dims[:-split_idx], activation + ) + self.mean_NN = create_MLP( + hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + ) + self.std_NN = create_MLP( + hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + ) + + # maybe zap + self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) + Normal.set_default_validate_args = False + + def forward(self, x, deterministic=True): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + latent = self.latent_NN(x) + mean = self.mean_NN(latent) + if deterministic: + return mean + log_std = self.log_std_init + self.std_NN(latent) + return mean, log_std.clamp(self.log_std_min, self.log_std_max).exp() + + def act(self, x): + mean, std = self.forward(x, deterministic=False) + self.distribution = Normal(mean, std) + return self.distribution.sample() + + def inference_policy(self, x): + return self.forward(x, deterministic=True) + + def export(self, path): + export_network(self.inference_policy, "policy", path, self.num_obs) + + def to(self, device): + super().to(device) + self.log_std_init = self.log_std_init.to(device) + return self diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 8ef5395d..96c53438 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,11 +20,14 @@ def __init__( if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: with torch.no_grad(): - critic_observations = self.obs_rms(critic_observations) - return self.NN(critic_observations).squeeze() + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) def loss_fn(self, obs, target): - return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e70971ed..fa03e319 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -5,7 +5,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner -from learning.modules import Actor, Critic +from learning.modules import Critic, ChimeraActor from learning.storage import DictStorage from learning.algorithms import SAC @@ -27,10 +27,20 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) critic_2 = 
Critic(num_critic_obs + num_actions, **self.critic_cfg) - self.alg = SAC(actor, critic_1, critic_2, device=self.device, **self.alg_cfg) + target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) def learn(self): self.set_up_logger() @@ -49,7 +59,7 @@ def learn(self): { "actor_obs": actor_obs, "next_actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], @@ -70,7 +80,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -152,9 +162,16 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + [ + "mean_critic_1_loss", + "mean_critic_2_loss", + "mean_actor_loss", + "mean_alpha_loss", + ], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb( (self.alg.actor, self.alg.critic_1, self.alg.critic_2) @@ -163,27 +180,37 @@ def set_up_logger(self): def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) - torch.save( - { - "actor_state_dict": self.alg.actor.state_dict(), - "critic_1_state_dict": self.alg.critic_1.state_dict(), - "critic_2_state_dict": self.alg.critic_2.state_dict(), - "optimizer_state_dict": self.alg.optimizer.state_dict(), - "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), - "iter": self.it, - }, - path, - ) + save_dict = { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_1_state_dict": self.alg.critic_1.state_dict(), + "critic_2_state_dict": self.alg.critic_2.state_dict(), + "log_alpha": self.alg.log_alpha, + "actor_optimizer_state_dict": self.alg.actor_optimizer.state_dict(), + "critic_1_optimizer_state_dict": self.alg.critic_1_optimizer.state_dict(), + "critic_2_optimizer_state_dict": self.alg.critic_2_optimizer.state_dict(), + "log_alpha_optimizer_state_dict": self.alg.log_alpha_optimizer.state_dict(), + "iter": self.it, + } + torch.save(save_dict, path) def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) + self.log_alpha = loaded_dict["log_alpha"] if load_optimizer: - self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) - self.alg.critic_optimizer.load_state_dict( - loaded_dict["critic_optimizer_state_dict"] + self.alg.actor_optimizer.load_state_dict( + loaded_dict["actor_optimizer_state_dict"] + ) + self.alg.critic_1_optimizer.load_state_dict( + loaded_dict["critic_1_optimizer_state_dict"] + ) + self.alg.critic_2_optimizer.load_state_dict( + loaded_dict["critic_2_optimizer_state_dict"] + ) + self.alg.log_alpha_optimizer.load_state_dict( + 
loaded_dict["log_alpha_optimizer_state_dict"] ) self.it = loaded_dict["iter"] @@ -194,7 +221,7 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - return self.alg.actor.act_inference(obs) + return self.alg.act(obs) # todo inference mode def export(self, path): self.alg.actor.export(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 6a4a4265..844f58a5 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): { "actor_obs": actor_obs, "next_actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], @@ -59,7 +59,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 15c674d6..b39dc16d 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,7 +1,7 @@ - from .utils import ( remove_zero_weighted_rewards, set_discount_from_horizon, + polyak_update ) from .dict_utils import * from .logger import Logger diff --git a/learning/utils/utils.py b/learning/utils/utils.py index 7a54c65a..d4eac980 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -17,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target From c542214c5cb863acdfe111a46b156ec00f64ea2f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 18:27:13 -0400 Subject: [PATCH 37/98] update fixed_robot to match legged_robot refactor --- gym/envs/base/fixed_robot.py | 92 ++++++++++++---------------- gym/envs/pendulum/pendulum.py | 4 -- gym/envs/pendulum/pendulum_config.py | 4 +- 3 files changed, 40 insertions(+), 60 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 1fe4c5d5..c3cf0566 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - 
self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(self.torques) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. 
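# --- Editor's sketch (not part of the patch) --------------------------------
# The refactored _step_physx_sim() above scatters the compact actuated-torque
# tensor into a full-DOF tensor via cfg.control.actuated_joints_mask, which is
# what the removed docstring line here alludes to (torques must match num_dof
# even when some DOFs are unactuated). A minimal, self-contained sketch of
# that mapping follows; the helper name expand_actuated_torques is made up for
# illustration, only the mask/tensor names mirror the patch.
import torch

def expand_actuated_torques(torques, actuated_joints_mask, num_envs, num_dof, device="cpu"):
    # torques: (num_envs, num_actuators), one column per actuated joint
    full = torch.zeros(num_envs, num_dof, device=device)
    actuated_idx = [i for i, m in enumerate(actuated_joints_mask) if m]
    full[:, actuated_idx] = torques  # unactuated DOFs keep zero torque
    return full

# e.g. the pendulum has a single actuated DOF:
# expand_actuated_torques(torch.ones(4, 1), [1], num_envs=4, num_dof=1)
# -----------------------------------------------------------------------------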
- - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 95af1fb2..77b05f10 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -4,10 +4,6 @@ class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 5e2e0e01..befc134d 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OffPolicyRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -122,5 +122,5 @@ class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates - algorithm_class_name = "SAC" + algorithm_class_name = "PPO2" num_steps_per_env = 32 From a5ca4fd0f7030debff27d8440725b868a921232b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 28 Mar 2024 18:58:49 -0400 Subject: [PATCH 38/98] WIP: separate config for pendulum SAC, comments on missing components --- gym/envs/__init__.py | 4 +- gym/envs/pendulum/pendulum_SAC_config.py | 72 ++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 gym/envs/pendulum/pendulum_SAC_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 57cbd8c0..824ac8ce 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -44,6 +44,7 @@ "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", + "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", } task_dict = { @@ -66,7 +67,8 @@ "HumanoidRunningRunnerCfg", ], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], - "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumCfg", "PendulumSACRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py new file mode 100644 index 00000000..840346ee --- /dev/null +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -0,0 +1,72 @@ +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO + + +class PendulumSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor: + hidden_dims = [128, 64, 32, 32] + split_idx = 2 + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + obs = [ + "dof_pos", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.05 + dof_vel = 0.0 + torques = 
0.01 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + batch_size = 2**17 + max_gradient_steps = 10 + action_max = 5.0 + action_min = -5.0 + actor_noise_std = 1.0 + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -1.0 + max_grad_norm = 1.0 + polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.98 + alpha_lr = 1e-4 + actor_lr = 1e-5 + critic_lr = 1e-5 + # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 + # warm-up steps + # auto entropy coefficient (alpha) + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 500 # number of policy updates + algorithm_class_name = "SAC" + num_steps_per_env = 32 From d86162e57f3d2c64ed28f1a4a172bc3b7a90efbd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 29 Mar 2024 13:04:45 -0400 Subject: [PATCH 39/98] WIP: tuning, critic loss higher than expected --- gym/envs/mit_humanoid/mit_humanoid.py | 12 ++--- gym/envs/mit_humanoid/mit_humanoid_config.py | 48 ++++++++++---------- learning/algorithms/ppo2.py | 13 ++---- learning/runners/on_policy_runner.py | 4 ++ learning/utils/dict_utils.py | 23 ++++------ 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 489b463d..118ac170 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -172,12 +172,9 @@ def _reward_dof_vel(self): ) * self._switch("stand") def _reward_dof_near_home(self): - return torch.mean( - self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] - ), - dim=1, - ) + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -233,4 +230,5 @@ def _reward_hips_forward(self): hip_yaw_abad /= torch.stack( (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 ) - return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index a3e113b6..033409e0 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -117,20 +117,20 @@ class control(LeggedRobotCfg.control): filter_gain = 0.1586 # 1: no filtering, 0: wall class oscillator: - base_frequency = 2.0 # [Hz] + base_frequency = 3.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 4.0] # min max [m/s] [-0.75, 0.75] - lin_vel_y = 0.35 # max [m/s] - yaw_vel = 0.5 # max [rad/s] + lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 0.0 # max [rad/s] class push_robots: toggle = True interval_s = 1 - max_push_vel_xy = 0.6 + max_push_vel_xy = 0.5 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: @@ -155,18 +155,18 @@ class asset(LeggedRobotCfg.asset): # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 - fix_base_link = False + fix_base_link = True disable_gravity = False disable_motors = False total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.9 + soft_dof_pos_limit = 
0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 max_contact_force = 1500.0 base_height_target = BASE_HEIGHT_REF - tracking_sigma = 0.5 + tracking_sigma = 0.25 # a smooth switch based on |cmd| (commanded velocity). switch_scale = 0.5 @@ -229,28 +229,28 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): class policy(LeggedRobotRunnerCfg.policy): disable_actions = False init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + actor_hidden_dims = [512, 512, 128] + critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" actor_obs = [ "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", - "commands", + # "commands", "dof_pos_obs", "dof_vel", "dof_pos_history", - "sampled_history_dof_pos", - "sampled_history_dof_vel", - "sampled_history_dof_pos_target", + # "sampled_history_dof_pos", + # "sampled_history_dof_vel", + # "sampled_history_dof_pos_target", "oscillator_obs", ] critic_obs = actor_obs - actions = ["dof_pos_target", "oscillator_freq"] + actions = ["dof_pos_target"] class noise: dof_pos = 0.005 @@ -262,23 +262,23 @@ class noise: class reward: class weights: - # tracking_lin_vel = 3.0 + # tracking_lin_vel = 0.0 # tracking_ang_vel = 1.5 # orientation = 1.0 torques = 5.0e-4 # min_base_height = 1.5 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-3 + action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 # dof_vel = 0.25 # stand_still = 0.25 dof_pos_limits = 0.25 - # dof_near_home = 0.25 + dof_near_home = 0.25 stance = 1.0 swing = 1.0 - hips_forward = 0.5 - walk_freq = 2.5 + hips_forward = 0.0 + walk_freq = 0.0 # 2.5 class termination_weight: termination = 15 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 69d6dcd2..efdce668 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -65,15 +65,12 @@ def train_mode(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic ) + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) self.update_actor(data) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 91e2ac26..6350eaa5 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,8 +38,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -89,6 +91,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..4f4ff420 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -23,16 +23,14 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): 
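# --- Editor's sketch (not part of the patch) --------------------------------
# The hunk that follows reworks compute_generalized_advantages(). For
# reference, the underlying GAE(lambda) recursion it implements is sketched
# here on plain (T, num_envs) tensors. This is a simplified illustration
# (no time-out bootstrapping, function name `gae` is made up) and is not the
# repository's implementation.
import torch

def gae(rewards, values, next_values, dones, gamma, lam):
    # rewards/values/next_values: float tensors (T, num_envs); dones: bool (T, num_envs)
    T = rewards.shape[0]
    advantages = torch.zeros_like(values)
    next_adv = torch.zeros_like(values[0])
    for t in reversed(range(T)):
        not_done = (~dones[t]).float()
        td_error = rewards[t] + gamma * next_values[t] * not_done - values[t]
        next_adv = td_error + gamma * lam * not_done * next_adv
        advantages[t] = next_adv
    returns = advantages + values  # regression targets for the critic
    return advantages, returns
# -----------------------------------------------------------------------------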
@torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) - - data.update({"advantages": torch.zeros_like(data["values"])}) - +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -47,15 +45,14 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["returns"] = data["advantages"] + data["values"] + return normalize(advantages) - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + +@torch.no_grad +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) # todo change num_epochs to num_batches From cf5cecdb3b7ac2c192e959d7e3f832040741aec8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 1 Apr 2024 10:01:32 -0400 Subject: [PATCH 40/98] WIP: partly fill replay buffer before training, bugfix on generator, small fixes --- gym/envs/pendulum/pendulum_SAC_config.py | 7 +++- gym/envs/pendulum/pendulum_config.py | 2 +- learning/runners/off_policy_runner.py | 50 +++++++++++++++++++++++- learning/storage/__init__.py | 3 +- learning/storage/replay_buffer.py | 48 +++++++++++++++++++++++ learning/utils/dict_utils.py | 3 ++ scripts/train.py | 2 - 7 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 learning/storage/replay_buffer.py diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 840346ee..a6fe30d7 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -45,7 +45,9 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - batch_size = 2**17 + initial_fill = 1000 + storage_size = 10**17 + batch_size = 256 max_gradient_steps = 10 action_max = 5.0 action_min = -5.0 @@ -69,4 +71,5 @@ class runner(FixedRobotCfgPPO.runner): experiment_name = "pendulum" max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" - num_steps_per_env = 32 + save_interval = 10 + num_steps_per_env = 100 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index befc134d..253e4165 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,7 +33,7 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 + ctrl_frequency = 20 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index fa03e319..380b12cd 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -6,11 +6,11 @@ from .BaseRunner import BaseRunner from learning.modules import Critic, 
ChimeraActor -from learning.storage import DictStorage +from learning.storage import ReplayBuffer from learning.algorithms import SAC logger = Logger() -storage = DictStorage() +storage = ReplayBuffer() class OffPolicyRunner(BaseRunner): @@ -73,6 +73,52 @@ def learn(self): device=self.device, ) + # fill buffer + for _ in range(self.alg_cfg["initial_fill"]): + with torch.inference_mode(): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + # print every 10% of initial fill + if _ % (self.alg_cfg["initial_fill"] // 10) == 0: + print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index f10df85f..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,4 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage -from .dict_storage import DictStorage \ No newline at end of file +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..889cc69b --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,48 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + self.fill_count = 0 + self.data[self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 3b102338..7e5c6c79 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -60,10 +60,13 @@ def create_uniform_generator( total_data = n * m if batch_size > total_data: + Warning("Batch 
size is larger than total data, using available data only.") batch_size = total_data num_batches_per_epoch = total_data // batch_size if max_gradient_steps: + if max_gradient_steps < num_batches_per_epoch: + num_batches_per_epoch = max_gradient_steps num_epochs = max_gradient_steps // num_batches_per_epoch num_epochs = max(num_epochs, 1) diff --git a/scripts/train.py b/scripts/train.py index 283717e8..759992cf 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,8 +12,6 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) - randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) From 82e2179b8e71604550836cdb757cdb8f2afce9db Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 1 Apr 2024 18:14:45 -0400 Subject: [PATCH 41/98] switch back to elu, remove grad clipping --- gym/envs/pendulum/pendulum_SAC_config.py | 20 ++++++++++---------- gym/envs/pendulum/pendulum_config.py | 6 +++--- learning/algorithms/sac.py | 22 ++++++++++++++-------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index a6fe30d7..e8726faa 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,10 +6,10 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor: - hidden_dims = [128, 64, 32, 32] + hidden_dims = [128, 64, 32] split_idx = 2 # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" obs = [ "dof_pos", @@ -28,16 +28,16 @@ class critic: "dof_pos", "dof_vel", ] - hidden_dims = [128, 64, 32] + hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" class reward: class weights: theta = 0.0 omega = 0.0 - equilibrium = 1.0 - energy = 0.05 + equilibrium = 10.0 + energy = 0.5 dof_vel = 0.0 torques = 0.01 @@ -48,7 +48,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): initial_fill = 1000 storage_size = 10**17 batch_size = 256 - max_gradient_steps = 10 + max_gradient_steps = 1 # 10 # SB3: 1 action_max = 5.0 action_min = -5.0 actor_noise_std = 1.0 @@ -59,9 +59,9 @@ class algorithm(FixedRobotCfgPPO.algorithm): max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 - alpha_lr = 1e-4 - actor_lr = 1e-5 - critic_lr = 1e-5 + alpha_lr = 3e-4 + actor_lr = 3e-4 + critic_lr = 3e-4 # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 # warm-up steps # auto entropy coefficient (alpha) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 253e4165..7944e5b2 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,7 +33,7 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 20 + ctrl_frequency = 100 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -43,7 +43,7 @@ class asset(FixedRobotCfg.asset): file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" disable_gravity = False disable_motors = False # all torques set to 0 - joint_damping = 0.1 + joint_damping = 0.5 class reward_settings(FixedRobotCfg.reward_settings): 
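# --- Editor's sketch (not part of the patch) --------------------------------
# The SAC config above sets polyak = 0.98 and notes the convention is flipped
# relative to stable-baselines3 (tau = 1 - polyak), i.e. the target network
# keeps 98% of its own weights per update. A small usage sketch of the
# polyak_update() helper added to learning/utils/utils.py earlier in this
# series, assuming the repository's `learning` package is importable; the
# nn.Linear modules are stand-ins for the online and target critics.
import torch.nn as nn
from learning.utils import polyak_update

critic = nn.Linear(4, 1)         # stand-in for the online critic
target_critic = nn.Linear(4, 1)  # stand-in for the target critic
target_critic.load_state_dict(critic.state_dict())  # start synchronized

# after each gradient step: target <- 0.02 * online + 0.98 * target
target_critic = polyak_update(critic, target_critic, polyak_factor=0.98)
# -----------------------------------------------------------------------------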
tracking_sigma = 0.25 @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 1.0 + tau_ff = 5.0 class PendulumRunnerCfg(FixedRobotCfgPPO): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 52be75ed..3c418474 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -1,5 +1,6 @@ import torch -import torch.nn as nn + +# import torch.nn as nn import torch.optim as optim from learning.utils import create_uniform_generator, polyak_update @@ -174,8 +175,12 @@ def update_critic(self, batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - 1.0 - actions_normalized.pow(2) + 1e-6 + # action_logp = distribution.log_prob(actions).sum(-1) - torch.log( + # 1.0 - actions_normalized.pow(2) + 1e-6 + # ).sum(-1) + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) # * returns target_action = actions_scaled, target_action_logp = action_logp @@ -200,14 +205,14 @@ def update_critic(self, batch): critic_loss_1 = self.critic_1.loss_fn(critic_in, target) self.critic_1_optimizer.zero_grad() critic_loss_1.backward() - nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.max_grad_norm) self.critic_1_optimizer.step() # critic_prediction_2 = self.critic_2.forward(critic_in) critic_loss_2 = self.critic_2.loss_fn(critic_in, target) self.critic_2_optimizer.zero_grad() critic_loss_2.backward() - nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.max_grad_norm) self.critic_2_optimizer.step() self.mean_critic_1_loss += critic_loss_1.item() @@ -239,8 +244,9 @@ def update_actor_and_alpha(self, batch): actor_prediction_logp = action_logp alpha_loss = ( - -self.log_alpha * (action_logp + self.target_entropy).detach().mean() - ) + -self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() self.log_alpha_optimizer.zero_grad() alpha_loss.backward() @@ -255,7 +261,7 @@ def update_actor_and_alpha(self, batch): ).mean() self.actor_optimizer.zero_grad() actor_loss.backward() - nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() self.mean_alpha_loss += alpha_loss.item() From 7da0fab0a336b15f0d87d81e6a60e16f401238dd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 4 Apr 2024 09:11:22 -0400 Subject: [PATCH 42/98] match URDF, obs, ctrl freq, etc to gymansium --- gym/envs/pendulum/pendulum.py | 14 ++++++++++++++ gym/envs/pendulum/pendulum_SAC_config.py | 2 +- gym/envs/pendulum/pendulum_config.py | 14 +++++++------- resources/robots/pendulum/urdf/pendulum.urdf | 4 ++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 77b05f10..b9004c86 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -4,6 +4,20 @@ class Pendulum(FixedRobot): + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = 
torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e8726faa..d03bdc52 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -12,7 +12,7 @@ class actor: activation = "elu" obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 7944e5b2..5f21cf9a 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -27,14 +27,14 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 4, torch.pi / 4], + "theta": [-torch.pi, torch.pi], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 20 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 5.0 + tau_ff = 2.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -62,7 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" obs = [ "dof_pos", @@ -83,14 +83,14 @@ class critic: ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.05 + energy = 0.0 dof_vel = 0.0 torques = 0.01 diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf index 90e894e4..d94bffea 100644 --- a/resources/robots/pendulum/urdf/pendulum.urdf +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -47,8 +47,8 @@ - - + + From 1feafdecf7d941f86a872c89efa2c9b51bdea717 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 4 Apr 2024 09:59:32 -0400 Subject: [PATCH 43/98] uniform sampling during initial fill, proper inference actions for play --- gym/envs/pendulum/pendulum_SAC_config.py | 4 ++-- gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/sac.py | 4 ++-- learning/runners/off_policy_runner.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index d03bdc52..2ba4c611 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -49,8 +49,8 @@ class algorithm(FixedRobotCfgPPO.algorithm): storage_size = 10**17 batch_size = 256 max_gradient_steps = 1 # 10 # SB3: 1 - action_max = 5.0 - action_min = -5.0 + action_max = 1.0 + action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 5f21cf9a..ea0b1fcc 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,7 +23,7 
@@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_basic" # * initial conditions for reset_to_range dof_pos_range = { diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 3c418474..24f09587 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -16,8 +16,8 @@ def __init__( target_critic_2, batch_size=2**15, max_gradient_steps=10, - action_max=10.0, - action_min=-10.0, + action_max=1.0, + action_min=-1.0, actor_noise_std=1.0, log_std_max=4.0, log_std_min=-20.0, diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 380b12cd..926683ac 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -50,6 +50,7 @@ def learn(self): self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) + actions = self.alg.act(actor_obs) tot_iter = self.it + self.num_learning_iterations self.save() @@ -76,7 +77,7 @@ def learn(self): # fill buffer for _ in range(self.alg_cfg["initial_fill"]): with torch.inference_mode(): - actions = self.alg.act(actor_obs) + actions = torch.rand_like(actions) * 2 - 1 self.set_actions( self.actor_cfg["actions"], actions, @@ -267,7 +268,12 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - return self.alg.act(obs) # todo inference mode + mean = self.alg.actor.forward(obs) # todo inference mode + actions = torch.tanh(mean) + actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( + self.alg.action_min, self.alg.action_max + ) + return actions def export(self, path): self.alg.actor.export(path) From 1969e0deab0aa458bd7c9ee97ee05c4fa2d5abb7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 5 Apr 2024 09:16:47 -0400 Subject: [PATCH 44/98] WIP: fix polyak bug, some more tuning --- gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/pendulum/pendulum_SAC_config.py | 17 ++++++----- gym/envs/pendulum/pendulum_config.py | 32 ++++++++++---------- learning/algorithms/sac.py | 30 +++++++++--------- learning/runners/off_policy_runner.py | 1 + resources/robots/pendulum/urdf/pendulum.urdf | 4 +-- 7 files changed, 45 insertions(+), 43 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index fbecced6..4ced4366 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -170,7 +170,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new batch_size = 2**15 # new diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 43b09e68..4ce1bc8f 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -293,7 +293,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new batch_size = 2**15 # new diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 2ba4c611..8071d7e4 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -11,11 
+11,11 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + normalize_obs = False obs = [ "dof_pos_obs", "dof_vel", ] - actions = ["tau_ff"] disable_actions = False @@ -25,30 +25,31 @@ class noise: class critic: obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + normalize_obs = False class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 10.0 - energy = 0.5 + energy = 1.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.001 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 1000 - storage_size = 10**17 + initial_fill = 10**3 + storage_size = 10**6 # 17 batch_size = 256 - max_gradient_steps = 1 # 10 # SB3: 1 + max_gradient_steps = 100 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 @@ -72,4 +73,4 @@ class runner(FixedRobotCfgPPO.runner): max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" save_interval = 10 - num_steps_per_env = 100 + num_steps_per_env = 64 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index ea0b1fcc..b274192c 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,18 +23,18 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi, torch.pi], + "theta": [-torch.pi / 4, torch.pi / 4], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 20 - desired_sim_frequency = 100 + ctrl_frequency = 100 + desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -43,7 +43,7 @@ class asset(FixedRobotCfg.asset): file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" disable_gravity = False disable_motors = False # all torques set to 0 - joint_damping = 0.5 + joint_damping = 0.1 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 2.0 + tau_ff = 1.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -62,7 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" obs = [ "dof_pos", @@ -83,14 +83,14 @@ class critic: ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.05 dof_vel = 0.0 torques = 0.01 @@ -100,22 +100,22 @@ class termination_weight: class algorithm(FixedRobotCfgPPO.algorithm): # both gamma = 0.99 - lam = 0.95 + discount_horizon = 2.0 + lam = 0.98 # shared - batch_size = 2**15 - max_grad_steps = 10 + max_gradient_steps = 10 # new storage_size = 2**17 # new - batch_size = 2**15 # new + batch_size = 2**16 # new clip_param = 0.2 - learning_rate = 1.0e-3 + learning_rate = 1.0e-4 max_grad_norm = 1.0 # Critic use_clipped_value_loss = True # Actor - 
entropy_coef = 0.1 - schedule = "adaptive" # could be adaptive, fixed + entropy_coef = 0.01 + schedule = "fixed" # could be adaptive, fixed desired_kl = 0.01 class runner(FixedRobotCfgPPO.runner): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 24f09587..7305d786 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -139,14 +139,14 @@ def update(self, data): self.update_critic(batch) self.update_actor_and_alpha(batch) - # Update Target Networks - self.target_critic_1 = polyak_update( - self.critic_1, self.target_critic_1, self.polyak - ) - self.target_critic_1 = polyak_update( - self.critic_1, self.target_critic_1, self.polyak - ) count += 1 + # Update Target Networks + self.target_critic_1 = polyak_update( + self.critic_1, self.target_critic_1, self.polyak + ) + self.target_critic_2 = polyak_update( + self.critic_2, self.target_critic_2, self.polyak + ) self.mean_actor_loss /= count self.mean_alpha_loss /= count self.mean_critic_1_loss /= count @@ -243,14 +243,14 @@ def update_actor_and_alpha(self, batch): actor_prediction = actions_scaled actor_prediction_logp = action_logp - alpha_loss = ( - -self.log_alpha * (action_logp + self.target_entropy).detach() - ).mean() - # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # alpha_loss = ( + # -self.log_alpha * (action_logp + self.target_entropy).detach() + # ).mean() + # # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() - self.log_alpha_optimizer.zero_grad() - alpha_loss.backward() - self.log_alpha_optimizer.step() + # self.log_alpha_optimizer.zero_grad() + # alpha_loss.backward() + # self.log_alpha_optimizer.step() critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) q_value_1 = self.critic_1.forward(critic_in) @@ -264,5 +264,5 @@ def update_actor_and_alpha(self, batch): # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() - self.mean_alpha_loss += alpha_loss.item() + # self.mean_alpha_loss += alpha_loss.item() self.mean_actor_loss += actor_loss.item() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 926683ac..31556725 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -216,6 +216,7 @@ def set_up_logger(self): "mean_critic_2_loss", "mean_actor_loss", "mean_alpha_loss", + "alpha", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf index d94bffea..90e894e4 100644 --- a/resources/robots/pendulum/urdf/pendulum.urdf +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -47,8 +47,8 @@ - - + + From b69f8c077355c3c147dadd45c4ff946211ebff52 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 8 Apr 2024 14:51:17 -0400 Subject: [PATCH 45/98] normalize advantages only after computing returns --- gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/ppo2.py | 3 ++- learning/utils/dict_utils.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index b274192c..b25e0455 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -103,7 +103,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): discount_horizon = 2.0 lam = 0.98 # shared - max_gradient_steps = 10 + max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**16 # 
new diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 3e5f454d..09c60ab9 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -5,6 +5,7 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, ) @@ -62,8 +63,8 @@ def update(self, data): data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] - self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) self.update_actor(data) def update_critic(self, data): diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 7e5c6c79..99534c22 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -48,7 +48,7 @@ def compute_generalized_advantages(data, gamma, lam, critic): ) advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - return normalize(advantages) + return advantages # todo change num_epochs to num_batches From 7db3f141a0f27938d5f9cf624adbecbda80c13c8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 11 Apr 2024 12:44:14 -0400 Subject: [PATCH 46/98] WIP: some tuning and fixes --- gym/envs/pendulum/pendulum_config.py | 6 +++--- learning/algorithms/sac.py | 21 ++++++++++++++------- learning/modules/chimera_actor.py | 2 ++ learning/runners/off_policy_runner.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index b25e0455..d13931d9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 + num_envs = 2**8 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
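# --- Editor's sketch (not part of the patch) --------------------------------
# Why PATCH 45 above computes returns before normalizing advantages: the
# critic's regression target is returns = advantages + values in the reward
# scale, while the policy gradient only needs relative, normalized advantages.
# A tiny numeric illustration (plain torch, no repository code involved):
import torch

values = torch.tensor([1.0, 2.0, 3.0])
advantages = torch.tensor([0.5, -0.5, 1.5])

returns = advantages + values                 # tensor([1.5, 1.5, 4.5]) -- correct critic targets
norm_adv = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
wrong_returns = norm_adv + values             # no longer in the reward scale; bad critic targets
# -----------------------------------------------------------------------------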
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_basic" # * initial conditions for reset_to_range dof_pos_range = { @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 1.0 + tau_ff = 10.0 class PendulumRunnerCfg(FixedRobotCfgPPO): diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 7305d786..c079c1f6 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -84,7 +84,6 @@ def __init__( # self.max_grad_norm = max_grad_norm # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters - self.learning_starts = 100 self.batch_size = batch_size self.polyak = polyak self.gamma = gamma @@ -195,8 +194,9 @@ def update_critic(self, batch): target_next = ( torch.min(target_critic_prediction_1, target_critic_prediction_2) - - self.alpha * target_action_logp + - self.alpha.detach() * target_action_logp ) + # the detach inside torch.no_grad() should be redundant target = rewards + self.gamma * dones.logical_not() * target_next critic_in = torch.cat((critic_obs, actions), dim=-1) @@ -243,14 +243,21 @@ def update_actor_and_alpha(self, batch): actor_prediction = actions_scaled actor_prediction_logp = action_logp + # entropy loss + alpha_loss = -( + self.log_alpha * (action_logp + self.target_entropy).detach() + ).mean() + # alpha_loss = ( # -self.log_alpha * (action_logp + self.target_entropy).detach() # ).mean() - # # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # alpha_loss = ( + # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() + # ) - # self.log_alpha_optimizer.zero_grad() - # alpha_loss.backward() - # self.log_alpha_optimizer.step() + self.log_alpha_optimizer.zero_grad() + alpha_loss.backward() + self.log_alpha_optimizer.step() critic_in = torch.cat((critic_obs, actor_prediction), dim=-1) q_value_1 = self.critic_1.forward(critic_in) @@ -264,5 +271,5 @@ def update_actor_and_alpha(self, batch): # nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() - # self.mean_alpha_loss += alpha_loss.item() + self.mean_alpha_loss += alpha_loss.item() self.mean_actor_loss += actor_loss.item() diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index 5902e3b7..76cb1b1e 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -44,6 +44,8 @@ def __init__( hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation ) + # nn.init.orthogonal_() + # maybe zap self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) Normal.set_default_validate_args = False diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 31556725..cd93b39e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -269,7 +269,7 @@ def switch_to_eval(self): def get_inference_actions(self): obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) - mean = self.alg.actor.forward(obs) # todo inference mode + mean = self.alg.actor.forward(obs) actions = torch.tanh(mean) actions = (actions * self.alg.action_delta + self.alg.action_offset).clamp( self.alg.action_min, self.alg.action_max From fe0a489e9e8ebfd1627fcdb2cb709dc8055f3017 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 11 Apr 2024 18:13:30 -0400 Subject: [PATCH 47/98] 
WIP: some more reasonable tuning for single env --- gym/envs/pendulum/pendulum_SAC_config.py | 19 ++++++++----------- gym/envs/pendulum/pendulum_config.py | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 8071d7e4..186e78c6 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -46,31 +46,28 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 10**3 + initial_fill = 256 storage_size = 10**6 # 17 batch_size = 256 - max_gradient_steps = 100 # 10 # SB3: 1 + max_gradient_steps = 10 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 alpha = 0.2 - target_entropy = -1.0 + target_entropy = 1.0 # -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 - alpha_lr = 3e-4 - actor_lr = 3e-4 - critic_lr = 3e-4 - # gSDE parameters missing: batch_size = 256!!!, but batch_size ~2**17 - # warm-up steps - # auto entropy coefficient (alpha) + alpha_lr = 3e-5 + actor_lr = 3e-5 + critic_lr = 3e-5 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 500 # number of policy updates algorithm_class_name = "SAC" - save_interval = 10 - num_steps_per_env = 64 + save_interval = 50 + num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index d13931d9..9aadb7d5 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**8 + num_envs = 1 # 2**8 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 From 38087410e05709436e2526c1edfdcd6018afa90a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 12 Apr 2024 13:14:33 -0400 Subject: [PATCH 48/98] refactor chimera actor hidden layers --- gym/envs/pendulum/pendulum_SAC_config.py | 15 +++++++++++---- gym/envs/pendulum/pendulum_config.py | 2 +- learning/modules/chimera_actor.py | 17 ++++++++++++----- learning/runners/off_policy_runner.py | 3 +++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 186e78c6..e473e668 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,10 +6,17 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor: - hidden_dims = [128, 64, 32] - split_idx = 2 + hidden_dims = { + "latent": [128, 128], + "mean": [64, 32], + "std": [64, 32], + } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = { + "latent": "elu", + "mean": "elu", + "std": "tanh", + } normalize_obs = False obs = [ @@ -46,7 +53,7 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 256 + initial_fill = 2 storage_size = 10**6 # 17 batch_size = 256 max_gradient_steps = 10 # 10 # SB3: 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 9aadb7d5..d13931d9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 1 # 2**8 + num_envs = 2**8 num_actuators = 1 # 1 for 
theta connecting base and pole episode_length_s = 5.0 diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index 76cb1b1e..f2cad4de 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -12,7 +12,6 @@ def __init__( num_obs, num_actions, hidden_dims, - split_idx=1, activation="tanh", log_std_max: float = 4.0, log_std_min: float = -20.0, @@ -33,15 +32,23 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - assert split_idx < len(hidden_dims) self.latent_NN = create_MLP( - num_obs, hidden_dims[-split_idx], hidden_dims[:-split_idx], activation + num_obs, + hidden_dims["latent"][-1], + hidden_dims["latent"][:-1], + activation["latent"], ) self.mean_NN = create_MLP( - hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + hidden_dims["latent"][-1], + num_actions, + hidden_dims["mean"], + activation["mean"], ) self.std_NN = create_MLP( - hidden_dims[-split_idx], num_actions, hidden_dims[split_idx:], activation + hidden_dims["latent"][-1], + num_actions, + hidden_dims["std"], + activation["std"], ) # nn.init.orthogonal_() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index cd93b39e..b8e5e652 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -32,6 +32,9 @@ def _set_up_alg(self): critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + self.alg = SAC( actor, critic_1, From 5636bfcc370a0dca4bed7ba8f81c02cd9df26510 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 12 Apr 2024 13:21:56 -0400 Subject: [PATCH 49/98] quickfix of init_fill prinout --- learning/runners/off_policy_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index b8e5e652..b9269234 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -120,7 +120,9 @@ def learn(self): ) storage.add_transitions(transition) # print every 10% of initial fill - if _ % (self.alg_cfg["initial_fill"] // 10) == 0: + if (self.alg_cfg["initial_fill"] > 10) and ( + _ % (self.alg_cfg["initial_fill"] // 10) == 0 + ): print(f"Filled {100 * _ / self.alg_cfg['initial_fill']}%") logger.tic("runtime") From 7448aa602c9e8365575c071b5fef21e572630a46 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 08:44:12 -0400 Subject: [PATCH 50/98] update setup info --- setup.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 8a3685da..06709c4e 100644 --- a/setup.py +++ b/setup.py @@ -2,19 +2,16 @@ from distutils.core import setup setup( - name="pkGym", - version="1.0.1", + name="QGym", + version="1.0.3", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), description="Isaac Gym environments", install_requires=[ "isaacgym", - "matplotlib", - "pandas", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", - "numpy>=1.16.4", ], ) From cb5175283b2603aefc6429083ecc155c4766b549 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 07:47:05 -0400 Subject: [PATCH 51/98] fix replay buffer - data was not being added correctly - runner was indexing along num_envs instead of time - use data getter to handle overfill --- learning/runners/off_policy_runner.py | 5 
++--- learning/storage/replay_buffer.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index b9269234..e57be9a9 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -73,7 +73,7 @@ def learn(self): storage.initialize( transition, self.env.num_envs, - self.env.num_envs * self.num_steps_per_env, + self.alg_cfg["storage_size"], device=self.device, ) @@ -179,8 +179,7 @@ def learn(self): logger.toc("collection") logger.tic("learning") - self.alg.update(storage.data) - storage.clear() + self.alg.update(storage.get_data()) logger.toc("learning") logger.log_all_categories() diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py index 889cc69b..de252f8a 100644 --- a/learning/storage/replay_buffer.py +++ b/learning/storage/replay_buffer.py @@ -19,6 +19,7 @@ def initialize( self.max_length = max_length self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) self.fill_count = 0 + self.add_index = 0 for key in dummy_dict.keys(): if dummy_dict[key].dim() == 1: # if scalar @@ -36,13 +37,18 @@ def initialize( @torch.inference_mode def add_transitions(self, transition: TensorDict): - if self.fill_count >= self.max_length: - self.fill_count = 0 - self.data[self.fill_count] = transition + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition self.fill_count += 1 + self.add_index += 1 + + def get_data(self): + return self.data[: max(self.fill_count, self.max_length), :] def clear(self): self.fill_count = 0 + self.add_index = 0 with torch.inference_mode(): for tensor in self.data: tensor.zero_() From eda4827727f7ad8ebfda6fb4f4f454dee4073a8c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 08:49:17 -0400 Subject: [PATCH 52/98] update configs to be closer to SB3 --- gym/envs/pendulum/pendulum.py | 3 ++- gym/envs/pendulum/pendulum_SAC_config.py | 16 ++++++++-------- gym/envs/pendulum/pendulum_config.py | 12 ++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index b9004c86..c7d9be99 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -50,4 +50,5 @@ def _reward_energy(self): ) desired_energy = m_pendulum * 9.81 * l_pendulum energy_error = kinetic_energy + potential_energy - desired_energy - return self._sqrdexp(energy_error / desired_energy) + return -(energy_error / desired_energy).pow(2) + # return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e473e668..39fdf7da 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -8,14 +8,14 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { "latent": [128, 128], - "mean": [64, 32], - "std": [64, 32], + "mean": [32], + "std": [32], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { "latent": "elu", "mean": "elu", - "std": "tanh", + "std": "elu", } normalize_obs = False @@ -35,7 +35,7 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [256, 256] + hidden_dims = [64, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = False @@ -53,16 +53,16 @@ class termination_weight: termination = 0.0 class 
algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 2 + initial_fill = 100 storage_size = 10**6 # 17 - batch_size = 256 - max_gradient_steps = 10 # 10 # SB3: 1 + batch_size = 256 # 4096 + max_gradient_steps = 1 # 10 # SB3: 1 action_max = 1.0 action_min = -1.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 - alpha = 0.2 + alpha = 0.8 target_entropy = 1.0 # -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index d13931d9..2c42aedb 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**8 + num_envs = 1 # 4096 num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 5.0 + episode_length_s = 25.0 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -27,13 +27,13 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 4, torch.pi / 4], + "theta": [-torch.pi / 2, torch.pi / 2], } dof_vel_range = {"theta": [-1, 1]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 + ctrl_frequency = 25 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -52,7 +52,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 10.0 + tau_ff = 5.0 class PendulumRunnerCfg(FixedRobotCfgPPO): From 67f75cc7702055e397c91c539dfcb670c2aa5f10 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 09:02:31 -0400 Subject: [PATCH 53/98] derp, min instead of max -_- derp --- gym/envs/pendulum/pendulum_SAC_config.py | 2 +- learning/storage/replay_buffer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 39fdf7da..ab16bf24 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -63,7 +63,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): log_std_max = 4.0 log_std_min = -20.0 alpha = 0.8 - target_entropy = 1.0 # -1.0 + target_entropy = -1.0 max_grad_norm = 1.0 polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.98 diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py index de252f8a..350a2c25 100644 --- a/learning/storage/replay_buffer.py +++ b/learning/storage/replay_buffer.py @@ -44,7 +44,7 @@ def add_transitions(self, transition: TensorDict): self.add_index += 1 def get_data(self): - return self.data[: max(self.fill_count, self.max_length), :] + return self.data[: min(self.fill_count, self.max_length), :] def clear(self): self.fill_count = 0 From 4eb0966b1adef3ebc75220a96d1d816af430cc04 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 12:04:41 -0400 Subject: [PATCH 54/98] split environment config for SAC pendulum --- gym/envs/__init__.py | 5 ++-- gym/envs/pendulum/pendulum_SAC_config.py | 31 ++++++++++++++++++++++++ gym/envs/pendulum/pendulum_config.py 
| 6 ++--- scripts/play.py | 6 ++--- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 824ac8ce..f57f4247 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,7 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "Pendulum": ".pendulum.pendulum", + "Pendulum": ".pendulum.pendulum" } config_dict = { @@ -32,6 +32,7 @@ "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", + "PendulumSACCfg": ".pendulum.pendulum_SAC_config", } runner_config_dict = { @@ -68,7 +69,7 @@ ], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], - "sac_pendulum": ["Pendulum", "PendulumCfg", "PendulumSACRunnerCfg"], + "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index ab16bf24..e9f73c57 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -1,4 +1,35 @@ +import torch from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.pendulum.pendulum_config import PendulumCfg + + +class PendulumSACCfg(PendulumCfg): + class env(PendulumCfg.env): + num_envs = 1 + episode_length_s = 5.0 + + class init_state(PendulumCfg.init_state): + reset_mode = "reset_to_basic" + default_joint_angles = {"theta": 0.0} + dof_pos_range = { + "theta": [-torch.pi / 2, torch.pi / 2], + } + dof_vel_range = {"theta": [-1, 1]} + + class control(PendulumCfg.control): + ctrl_frequency = 25 + desired_sim_frequency = 200 + + class asset(PendulumCfg.asset): + joint_damping = 0.1 + + class reward_settings(PendulumCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(PendulumCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + tau_ff = 5.0 class PendulumSACRunnerCfg(FixedRobotCfgPPO): diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 2c42aedb..49534568 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,8 +5,8 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 1 # 4096 - num_actuators = 1 # 1 for theta connecting base and pole + num_envs = 4096 + num_actuators = 1 episode_length_s = 25.0 class terrain(FixedRobotCfg.terrain): @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" # * initial conditions for reset_to_range dof_pos_range = { diff --git a/scripts/play.py b/scripts/play.py index 4486daae..d832bf69 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -9,16 +9,16 @@ def setup(args): env_cfg, train_cfg = task_registry.create_cfgs(args) - env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + env_cfg.env.num_envs = 32 if hasattr(env_cfg, "push_robots"): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 + env_cfg.env.episode_length_s = 5 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() + env_cfg.init_state.reset_mode = "reset_to_range" env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) From 5ee76742c6c6e78cb33003cb1cf5f191641965aa Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 22 Apr 2024 11:55:38 -0400 Subject: [PATCH 55/98] WIP: random tweaks and tuning before switching to other stuff --- gym/envs/base/task_skeleton.py | 2 +- gym/envs/pendulum/pendulum.py | 4 --- gym/envs/pendulum/pendulum_SAC_config.py | 26 +++++++++--------- learning/algorithms/sac.py | 24 ++++++++++++++--- learning/modules/chimera_actor.py | 34 +++++++++++------------- learning/runners/off_policy_runner.py | 5 ++++ scripts/play.py | 2 +- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..6a1b352c 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -67,7 +67,7 @@ def _eval_reward(self, name): def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length self.to_be_reset = self.timed_out | self.terminated diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c7d9be99..c22c034a 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -35,10 +35,6 @@ def _reward_equilibrium(self): # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) - def _reward_torques(self): - """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) - def _reward_energy(self): m_pendulum = 1.0 l_pendulum = 1.0 diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index e9f73c57..aa182e15 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,8 +5,8 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 1 - episode_length_s = 5.0 + num_envs = 256 + episode_length_s = 2.5 class init_state(PendulumCfg.init_state): reset_mode = "reset_to_basic" @@ -29,7 +29,7 @@ class reward_settings(PendulumCfg.reward_settings): class scaling(PendulumCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi - tau_ff = 5.0 + tau_ff = 1.0 class PendulumSACRunnerCfg(FixedRobotCfgPPO): @@ -38,9 +38,9 @@ 
class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { - "latent": [128, 128], - "mean": [32], - "std": [32], + "latent": [400], + "mean": [300], + "std": [300], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { @@ -66,25 +66,25 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [64, 64] + hidden_dims = [256, 256] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = False + normalize_obs = True class reward: class weights: theta = 0.0 omega = 0.0 - equilibrium = 10.0 + equilibrium = 2.0 energy = 1.0 dof_vel = 0.0 - torques = 0.001 + torques = 0.01 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 100 + initial_fill = 10**3 storage_size = 10**6 # 17 batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 @@ -105,7 +105,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates + max_iterations = 10000 # number of policy updates algorithm_class_name = "SAC" - save_interval = 50 + save_interval = 250 num_steps_per_env = 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index c079c1f6..f7bd7510 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -90,6 +90,13 @@ def __init__( # self.ent_coef = "fixed" # self.target_entropy = "fixed" + self.test_input = torch.randn(256, 3, device=self.device) + self.test_actions = torch.zeros(256, 1, device=self.device) + self.test_action_mean = torch.zeros(256, device=self.device) + self.test_action_std = torch.zeros(256, device=self.device) + self.test_action_max = torch.zeros(256, device=self.device) + self.test_action_min = torch.zeros(256, device=self.device) + @property def alpha(self): return self.log_alpha.exp() @@ -112,8 +119,6 @@ def act(self, obs): mean, std = self.actor.forward(obs, deterministic=False) distribution = torch.distributions.Normal(mean, std) actions = distribution.rsample() - - ## * self._scale_actions(actions, intermediate=True) actions_normalized = torch.tanh(actions) # RSL also does a resahpe(-1, self.action_size), not sure why actions_scaled = ( @@ -121,6 +126,14 @@ def act(self, obs): ).clamp(self.action_min, self.action_max) return actions_scaled + def act_inference(self, obs): + mean = self.actor.forward(obs, deterministic=True) + actions_normalized = torch.tanh(mean) + actions_scaled = ( + actions_normalized * self.action_delta + self.action_offset + ).clamp(self.action_min, self.action_max) + return actions_scaled + def update(self, data): generator = create_uniform_generator( data, @@ -150,7 +163,12 @@ def update(self, data): self.mean_alpha_loss /= count self.mean_critic_1_loss /= count self.mean_critic_2_loss /= count - + with torch.inference_mode(): + self.test_actions = self.act_inference(self.test_input).cpu() + self.test_action_mean = self.test_actions.mean().item() + self.test_action_std = self.test_actions.std().item() + self.test_action_max = self.test_actions.max().item() + self.test_action_min = self.test_actions.min().item() return None def update_critic(self, batch): diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index f2cad4de..1addc83d 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -12,10 +12,10 @@ def __init__( num_obs, num_actions, hidden_dims, - activation="tanh", - log_std_max: float = 4.0, - log_std_min: float = 
-20.0, - std_init: float = 1.0, + activation, + std_init=1.0, + log_std_max=4.0, + log_std_min=-20.0, normalize_obs=True, **kwargs, ): @@ -33,26 +33,24 @@ def __init__( self.num_actions = num_actions self.latent_NN = create_MLP( - num_obs, - hidden_dims["latent"][-1], - hidden_dims["latent"][:-1], - activation["latent"], + num_inputs=num_obs, + num_outputs=hidden_dims["latent"][-1], + hidden_dims=hidden_dims["latent"][:-1], + activation=activation["latent"], ) self.mean_NN = create_MLP( - hidden_dims["latent"][-1], - num_actions, - hidden_dims["mean"], - activation["mean"], + num_inputs=hidden_dims["latent"][-1], + num_outputs=num_actions, + hidden_dims=hidden_dims["mean"], + activation=activation["mean"], ) self.std_NN = create_MLP( - hidden_dims["latent"][-1], - num_actions, - hidden_dims["std"], - activation["std"], + num_inputs=hidden_dims["latent"][-1], + num_outputs=num_actions, + hidden_dims=hidden_dims["std"], + activation=activation["std"], ) - # nn.init.orthogonal_() - # maybe zap self.distribution = Normal(torch.zeros(num_actions), torch.ones(num_actions)) Normal.set_default_validate_args = False diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e57be9a9..2a7bc66e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -221,6 +221,11 @@ def set_up_logger(self): "mean_actor_loss", "mean_alpha_loss", "alpha", + # "test_actions", + "test_action_mean", + "test_action_std", + "test_action_max", + "test_action_min", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) diff --git a/scripts/play.py b/scripts/play.py index d832bf69..37882f7e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -14,7 +14,7 @@ def setup(args): env_cfg.push_robots.toggle = False if hasattr(env_cfg, "commands"): env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 5 + env_cfg.env.episode_length_s = 50 env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() env_cfg.init_state.reset_mode = "reset_to_range" From e10c8360a9c17c22b1e88fe5a802ef7cce1eba7a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:15:57 -0400 Subject: [PATCH 56/98] burn in normalization before learning --- learning/algorithms/ppo2.py | 2 +- learning/runners/on_policy_runner.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..db56ef8c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -58,7 +58,7 @@ def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data, last_obs=None): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..60dff1e9 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,7 +37,7 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -50,6 +50,10 @@ def learn(self): device=self.device, ) + # burn in observation normalization. 
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -57,7 +61,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -116,6 +120,21 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( From acedd9719defd776d0ff5728926cf31c737c9b0f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:23:41 -0400 Subject: [PATCH 57/98] fix configs wrt to normalization, and fixed_base --- gym/envs/base/fixed_robot_config.py | 27 +++++++++---------- gym/envs/base/legged_robot_config.py | 3 +-- gym/envs/cartpole/cartpole_config.py | 2 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 ++- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +-- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 ++ gym/envs/pendulum/pendulum_config.py | 9 ++++--- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..b5b5e88d 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..2dd2083d 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -238,13 +238,12 @@ class actor: hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] + normalize_obs = True actions = ["q_des"] disable_actions = False diff 
--git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index f18d5a3b..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..2f9bf540 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,7 +130,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - obs = [ "base_lin_vel", "base_ang_vel", @@ -140,6 +139,7 @@ class actor: "dof_vel", "dof_pos_target", ] + normalize_obs = True actions = ["dof_pos_target"] add_noise = True disable_actions = False @@ -168,6 +168,7 @@ class critic: "dof_vel", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..d89715fc 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_ang_vel", "projected_gravity", @@ -82,6 +81,7 @@ class actor: "dof_vel", "phase_obs", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -100,7 +100,6 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_height", "base_lin_vel", @@ -112,6 +111,7 @@ class critic: "phase_obs", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 86a8d606..c7e061a9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -198,6 +198,7 @@ class actor: "dof_vel", "dof_pos_history", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -226,6 +227,7 @@ class critic: "dof_vel", "dof_pos_history", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..eed5d0a9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -68,6 +68,7 @@ class actor: "dof_pos", "dof_vel", ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False @@ -77,14 +78,14 @@ class noise: dof_vel = 0.0 class critic: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" obs = [ "dof_pos", "dof_vel", ] - hidden_dims = [128, 64, 32] - # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" - standard_critic_nn = True + normalize_obs = True class reward: class weights: From a79fae1d66d2f70dec2ac2af2bc28e182c2d871d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 31 May 2024 16:14:27 -0400 Subject: [PATCH 58/98] ppo2 works again on pendulum (lqr/testing config) --- .gitignore | 1 + gym/envs/base/fixed_robot.py | 8 ++-- 
gym/envs/pendulum/pendulum.py | 60 ++++++++++++++++++---------- gym/envs/pendulum/pendulum_config.py | 37 +++++++++-------- 4 files changed, 65 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 0b68375a..6028e3b5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +*smooth_exploration/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index c3cf0566..73e2fb67 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -568,11 +568,11 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + sigma = self.cfg.reward_settings.tracking_sigma + return torch.exp(-torch.square(x) / sigma) def _reward_torques(self): """Penalize torques""" diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c22c034a..c63ffa6c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,22 +1,35 @@ +from math import sqrt import torch from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _init_buffers(self): - super()._init_buffers() - self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) - - def _post_decimation_step(self): - super()._post_decimation_step() - self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) - - def _reset_system(self, env_ids): - super()._reset_system(env_ids) - self.dof_pos_obs[env_ids] = torch.cat( - [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + def _post_physics_step(self): + """Update all states that are not handled in PhysX""" + super()._post_physics_step() + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] @@ -30,21 +43,28 @@ def _reward_equilibrium(self): error = torch.abs(self.dof_state) error[:, 0] /= self.scales["dof_pos"] error[:, 1] /= self.scales["dof_vel"] - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) + return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) # return torch.exp( # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + 
self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy - return -(energy_error / desired_energy).pow(2) - # return self._sqrdexp(energy_error / desired_energy) + return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 49534568..739594c3 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 4096 - num_actuators = 1 - episode_length_s = 25.0 + num_envs = 2**12 + num_actuators = 1 # 1 for theta connecting base and pole + episode_length_s = 10.0 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -27,14 +27,14 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi / 2, torch.pi / 2], + "theta": [-torch.pi, torch.pi], } - dof_vel_range = {"theta": [-1, 1]} + dof_vel_range = {"theta": [-5, 5]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 25 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -52,7 +54,7 @@ class scaling(FixedRobotCfg.scaling): dof_vel = 5.0 dof_pos = 2.0 * torch.pi # * Action scales - tau_ff = 5.0 + tau_ff = 1.0 class PendulumRunnerCfg(FixedRobotCfgPPO): @@ -63,7 +65,7 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - + normalize_obs = True obs = [ "dof_pos", "dof_vel", @@ -77,6 +79,8 @@ class noise: dof_vel = 0.0 class critic: + critic_class_name = "" + normalize_obs = True obs = [ "dof_pos", "dof_vel", @@ -90,24 +94,23 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.05 + energy = 0.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.025 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): # both - gamma = 0.99 - discount_horizon = 2.0 + gamma = 0.95 + # discount_horizon = 2.0 lam = 0.98 # shared max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**16 # new - clip_param = 0.2 learning_rate = 1.0e-4 max_grad_norm = 1.0 @@ -121,6 +124,6 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates + max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 100 From 3f81326c970a6cbf1d7cef0c510b82f5b8da3ad1 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 31 May 2024 16:44:30 -0400 Subject: [PATCH 59/98] revert some pendulum 
changes, SAC runs now but doesn't learn --- gym/envs/pendulum/pendulum.py | 35 +++++++++--------------- gym/envs/pendulum/pendulum_SAC_config.py | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index c63ffa6c..4f111d96 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,36 +1,27 @@ -from math import sqrt import torch from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) def _check_terminations_and_timeouts(self): super()._check_terminations_and_timeouts() self.terminated = self.timed_out - def reset_to_uniform(self, env_ids): - grid_points = int(sqrt(self.num_envs)) - lin_pos = torch.linspace( - self.dof_pos_range[0, 0], - self.dof_pos_range[0, 1], - grid_points, - device=self.device, - ) - lin_vel = torch.linspace( - self.dof_vel_range[0, 0], - self.dof_vel_range[0, 1], - grid_points, - device=self.device, - ) - grid = torch.cartesian_prod(lin_pos, lin_vel) - self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) - self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index aa182e15..2ccb4eab 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -9,7 +9,7 @@ class env(PendulumCfg.env): episode_length_s = 2.5 class init_state(PendulumCfg.init_state): - reset_mode = "reset_to_basic" + reset_mode = "reset_to_range" default_joint_angles = {"theta": 0.0} dof_pos_range = { "theta": [-torch.pi / 2, torch.pi / 2], From 9278807df5afa20fe6c7553a92bfe4b8c5b4b1b3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 5 Jun 2024 14:11:54 -0400 Subject: [PATCH 60/98] sac pendulum converges somewhat (bugfixes + new rewards) --- gym/envs/pendulum/pendulum.py | 20 +++++++--- gym/envs/pendulum/pendulum_SAC_config.py | 51 ++++++++++++------------ gym/envs/pendulum/pendulum_config.py | 2 +- learning/algorithms/sac.py | 11 ++--- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 4f111d96..84aa0b54 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,4 +1,5 @@ import torch +import numpy as np from gym.envs.base.fixed_robot import FixedRobot @@ -23,12 +24,16 @@ def _check_terminations_and_timeouts(self): self.terminated = self.timed_out def _reward_theta(self): - theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] - return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + theta_norm = self._normalize_theta() + return -theta_norm.pow(2).item() + # theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + # return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): - omega_rwd = 
torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) - return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + omega = self.dof_vel[:, 0] + return -omega.pow(2).item() + # omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + # return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): error = torch.abs(self.dof_state) @@ -41,7 +46,8 @@ def _reward_equilibrium(self): def _reward_torques(self): """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return -self.torques.pow(2).item() + # return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): kinetic_energy = ( @@ -59,3 +65,7 @@ def _reward_energy(self): desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 2ccb4eab..8b0d1f4a 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,20 +5,20 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 256 - episode_length_s = 2.5 + num_envs = 1 + episode_length_s = 4 class init_state(PendulumCfg.init_state): reset_mode = "reset_to_range" default_joint_angles = {"theta": 0.0} dof_pos_range = { - "theta": [-torch.pi / 2, torch.pi / 2], + "theta": [-torch.pi, torch.pi], } dof_vel_range = {"theta": [-1, 1]} class control(PendulumCfg.control): - ctrl_frequency = 25 - desired_sim_frequency = 200 + ctrl_frequency = 20 + desired_sim_frequency = 100 class asset(PendulumCfg.asset): joint_damping = 0.1 @@ -38,9 +38,9 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): class actor: hidden_dims = { - "latent": [400], - "mean": [300], - "std": [300], + "latent": [128, 64], + "mean": [32], + "std": [32], } # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = { @@ -66,46 +66,47 @@ class critic: "dof_pos_obs", "dof_vel", ] - hidden_dims = [256, 256] + hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False class reward: class weights: - theta = 0.0 - omega = 0.0 - equilibrium = 2.0 + theta = 1.0 + omega = 0.1 + equilibrium = 0.0 energy = 1.0 dof_vel = 0.0 - torques = 0.01 + torques = 0.001 class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 10**3 + initial_fill = 500 storage_size = 10**6 # 17 batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 - action_max = 1.0 - action_min = -1.0 + action_max = 2.0 + action_min = -2.0 actor_noise_std = 1.0 log_std_max = 4.0 log_std_min = -20.0 - alpha = 0.8 + alpha = 0.2 target_entropy = -1.0 max_grad_norm = 1.0 - polyak = 0.98 # flipped compared to stable-baselines3 (polyak == 1-tau) - gamma = 0.98 - alpha_lr = 3e-5 - actor_lr = 3e-5 - critic_lr = 3e-5 + polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 3e-4 + critic_lr = 3e-4 class runner(FixedRobotCfgPPO.runner): run_name = "" - experiment_name = "pendulum" - max_iterations = 10000 # number of policy updates + experiment_name = "sac_pendulum" + max_iterations = 30_000 # 
number of policy updates algorithm_class_name = "SAC" save_interval = 250 num_steps_per_env = 1 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 739594c3..89010ebf 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -94,7 +94,7 @@ class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.5 dof_vel = 0.0 torques = 0.025 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index f7bd7510..8c57ca4c 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -183,10 +183,10 @@ def update_critic(self, batch): # * self._sample_action(actor_next_obs) mean, std = self.actor.forward(next_actor_obs, deterministic=False) distribution = torch.distributions.Normal(mean, std) - actions = distribution.rsample() + next_actions = distribution.rsample() ## * self._scale_actions(actions, intermediate=True) - actions_normalized = torch.tanh(actions) + actions_normalized = torch.tanh(next_actions) # RSL also does a resahpe(-1, self.action_size), not sure why actions_scaled = ( actions_normalized * self.action_delta + self.action_offset @@ -196,7 +196,7 @@ def update_critic(self, batch): # 1.0 - actions_normalized.pow(2) + 1e-6 # ).sum(-1) action_logp = ( - distribution.log_prob(actions) + distribution.log_prob(next_actions) - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) @@ -253,8 +253,9 @@ def update_actor_and_alpha(self, batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - 1.0 - actions_normalized.pow(2) + 1e-6 + action_logp = ( + distribution.log_prob(actions) + - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) ).sum(-1) # * returns target_action = actions_scaled, target_action_logp = action_logp From a6d4adfda3122b745dc846b5439b889966a39fff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 5 Jun 2024 15:24:53 -0400 Subject: [PATCH 61/98] increase LR (all pendulums balance now) --- gym/envs/pendulum/pendulum_SAC_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 8b0d1f4a..a15ae9de 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -100,13 +100,13 @@ class algorithm(FixedRobotCfgPPO.algorithm): polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.99 alpha_lr = 1e-4 - actor_lr = 3e-4 - critic_lr = 3e-4 + actor_lr = 5e-4 + critic_lr = 5e-4 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" max_iterations = 30_000 # number of policy updates algorithm_class_name = "SAC" - save_interval = 250 + save_interval = 5000 num_steps_per_env = 1 From 02611d026aa73d8c6e55d1d6458aa3de5b2c7294 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 6 Jun 2024 15:11:26 -0400 Subject: [PATCH 62/98] increase LR and episode len --- gym/envs/pendulum/pendulum_SAC_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index a15ae9de..c17494ef 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -6,7 +6,7 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): num_envs = 1 - episode_length_s = 4 + episode_length_s = 10 class 
init_state(PendulumCfg.init_state): reset_mode = "reset_to_range" @@ -100,13 +100,13 @@ class algorithm(FixedRobotCfgPPO.algorithm): polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) gamma = 0.99 alpha_lr = 1e-4 - actor_lr = 5e-4 - critic_lr = 5e-4 + actor_lr = 1e-3 + critic_lr = 1e-3 class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" - max_iterations = 30_000 # number of policy updates + max_iterations = 40_000 # number of policy updates algorithm_class_name = "SAC" save_interval = 5000 num_steps_per_env = 1 From dd9704e247110147cb1955bc680a6c9657823533 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 7 Jun 2024 16:42:45 -0400 Subject: [PATCH 63/98] added SAC mini cheetah and refactorings --- gym/envs/__init__.py | 7 + .../mini_cheetah/mini_cheetah_SAC_config.py | 184 ++++++++++++++++++ learning/algorithms/sac.py | 48 +---- learning/runners/off_policy_runner.py | 5 - 4 files changed, 194 insertions(+), 50 deletions(-) create mode 100644 gym/envs/mini_cheetah/mini_cheetah_SAC_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index f57f4247..f9d8b788 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -27,6 +27,7 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -40,6 +41,7 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahSACRunnerCfg": ".mini_cheetah.mini_cheetah_SAC_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -61,6 +63,11 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "sac_mini_cheetah": [ + "MiniCheetahRef", + "MiniCheetahSACCfg", + "MiniCheetahSACRunnerCfg" + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py new file mode 100644 index 00000000..7f2b3a1d --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -0,0 +1,184 @@ +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg + +BASE_HEIGHT_REF = 0.3 + + +class MiniCheetahSACCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + num_actuators = 12 + episode_length_s = 4 # TODO + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 20 # TODO + desired_sim_frequency = 100 + + class commands(MiniCheetahRefCfg.commands): + pass + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + pass + + class asset(MiniCheetahRefCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + 
"mini_cheetah/urdf/mini_cheetah_simple.urdf" + ) + foot_name = "foot" + penalize_contacts_on = ["shank"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["foot"] + collapse_fixed_joints = False + self_collisions = 1 + flip_visual_attachments = False + disable_gravity = False + disable_motors = False + joint_damping = 0.1 + rotor_inertia = [0.002268, 0.002268, 0.005484] * 4 + + class reward_settings(MiniCheetahRefCfg.reward_settings): + soft_dof_pos_limit = 0.9 + soft_dof_vel_limit = 0.9 + soft_torque_limit = 0.9 + max_contact_force = 600.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + class scaling(MiniCheetahRefCfg.scaling): + base_ang_vel = 0.3 + base_lin_vel = BASE_HEIGHT_REF + dof_vel = 4 * [2.0, 2.0, 4.0] + base_height = 0.3 + dof_pos = 4 * [0.2, 0.3, 0.3] + dof_pos_obs = dof_pos + dof_pos_target = 4 * [0.2, 0.3, 0.3] + tau_ff = 4 * [18, 18, 28] + commands = [3, 1, 3] + + +class MiniCheetahSACRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "OffPolicyRunner" + + class actor: + hidden_dims = { + "latent": [128, 128], + "mean": [64], + "std": [64], + } + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = { + "latent": "elu", + "mean": "elu", + "std": "elu", + } + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = ["dof_pos_target"] + add_noise = False # TODO + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [128, 128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + # TODO[lm]: Handle normalization + normalize_obs = False + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 + lin_vel_z = 0.0 + ang_vel_xy = 0.01 + orientation = 1.0 + torques = 5.0e-7 + dof_vel = 0.0 + min_base_height = 1.5 + collision = 0.0 + action_rate = 0.01 + action_rate2 = 0.001 + stand_still = 0.0 + dof_pos_limits = 0.0 + feet_contact_forces = 0.0 + dof_near_home = 0.0 + reference_traj = 1.5 + swing_grf = 1.5 + stance_grf = 1.5 + + class termination_weight: + termination = 0.15 + + class algorithm(FixedRobotCfgPPO.algorithm): + # Taken from SAC pendulum + initial_fill = 500 + storage_size = 10**6 + batch_size = 256 + max_gradient_steps = 1 # 10 + action_max = 1.0 # TODO + action_min = -1.0 # TODO + actor_noise_std = 0.5 # TODO + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -12.0 # -action_dim + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to SB3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "sac_mini_cheetah" + max_iterations = 50_000 + algorithm_class_name = "SAC" + save_interval = 10_000 + num_steps_per_env = 1 diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index 8c57ca4c..e0651d2d 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -27,26 +27,14 @@ def __init__( max_grad_norm=1.0, polyak=0.995, gamma=0.99, - # clip_param=0.2, # * PPO - # gamma=0.998, - # lam=0.95, - # entropy_coef=0.0, 
actor_lr=1e-4, critic_lr=1e-4, - # max_grad_norm=1.0, - # use_clipped_value_loss=True, - # schedule="fixed", - # desired_kl=0.01, device="cpu", **kwargs, ): self.device = device - # self.desired_kl = desired_kl - # self.schedule = schedule - # self.lr = lr - - # * PPO components + # * SAC components self.actor = actor.to(self.device) self.critic_1 = critic_1.to(self.device) self.critic_2 = critic_2.to(self.device) @@ -74,29 +62,14 @@ def __init__( target_entropy if target_entropy else -self.actor.num_actions ) - # * PPO parameters - # self.clip_param = clip_param - # self.batch_size = batch_size - self.max_gradient_steps = max_gradient_steps - # self.entropy_coef = entropy_coef - # self.gamma = gamma - # self.lam = lam - # self.max_grad_norm = max_grad_norm - # self.use_clipped_value_loss = use_clipped_value_loss # * SAC parameters + self.max_gradient_steps = max_gradient_steps self.batch_size = batch_size self.polyak = polyak self.gamma = gamma # self.ent_coef = "fixed" # self.target_entropy = "fixed" - self.test_input = torch.randn(256, 3, device=self.device) - self.test_actions = torch.zeros(256, 1, device=self.device) - self.test_action_mean = torch.zeros(256, device=self.device) - self.test_action_std = torch.zeros(256, device=self.device) - self.test_action_max = torch.zeros(256, device=self.device) - self.test_action_min = torch.zeros(256, device=self.device) - @property def alpha(self): return self.log_alpha.exp() @@ -163,12 +136,7 @@ def update(self, data): self.mean_alpha_loss /= count self.mean_critic_1_loss /= count self.mean_critic_2_loss /= count - with torch.inference_mode(): - self.test_actions = self.act_inference(self.test_input).cpu() - self.test_action_mean = self.test_actions.mean().item() - self.test_action_std = self.test_actions.std().item() - self.test_action_max = self.test_actions.max().item() - self.test_action_min = self.test_actions.min().item() + return None def update_critic(self, batch): @@ -192,9 +160,6 @@ def update_critic(self, batch): actions_normalized * self.action_delta + self.action_offset ).clamp(self.action_min, self.action_max) ## * - # action_logp = distribution.log_prob(actions).sum(-1) - torch.log( - # 1.0 - actions_normalized.pow(2) + 1e-6 - # ).sum(-1) action_logp = ( distribution.log_prob(next_actions) - torch.log(1.0 - actions_normalized.pow(2) + 1e-6) @@ -267,13 +232,6 @@ def update_actor_and_alpha(self, batch): self.log_alpha * (action_logp + self.target_entropy).detach() ).mean() - # alpha_loss = ( - # -self.log_alpha * (action_logp + self.target_entropy).detach() - # ).mean() - # alpha_loss = ( - # -(self.log_alpha * (action_logp + self.target_entropy)).detach().mean() - # ) - self.log_alpha_optimizer.zero_grad() alpha_loss.backward() self.log_alpha_optimizer.step() diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 2a7bc66e..e57be9a9 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -221,11 +221,6 @@ def set_up_logger(self): "mean_actor_loss", "mean_alpha_loss", "alpha", - # "test_actions", - "test_action_mean", - "test_action_std", - "test_action_max", - "test_action_min", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From 65eb25339402f7efadff6c661901bdba5b3a7e57 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 12 Jun 2024 10:05:59 -0400 Subject: [PATCH 64/98] PPO and SAC work on same rewards and config --- gym/envs/pendulum/pendulum.py | 27 ++++++++++-------------- 
gym/envs/pendulum/pendulum_SAC_config.py | 14 ++++++------ gym/envs/pendulum/pendulum_config.py | 8 +++---- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 84aa0b54..d935b7b8 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -24,30 +24,24 @@ def _check_terminations_and_timeouts(self): self.terminated = self.timed_out def _reward_theta(self): - theta_norm = self._normalize_theta() - return -theta_norm.pow(2).item() - # theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] - # return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling + return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): - omega = self.dof_vel[:, 0] - return -omega.pow(2).item() - # omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) - # return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): - error = torch.abs(self.dof_state) - error[:, 0] /= self.scales["dof_pos"] - error[:, 1] /= self.scales["dof_vel"] + theta_norm = self._normalize_theta() + omega = self.dof_vel[:, 0] + error = torch.stack( + [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 + ) return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) - # return torch.exp( - # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma - # ) def _reward_torques(self): """Penalize torques""" - return -self.torques.pow(2).item() - # return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): kinetic_energy = ( @@ -67,5 +61,6 @@ def _reward_energy(self): return self._sqrdexp(energy_error / desired_energy) def _normalize_theta(self): + # normalize to range [-pi, pi] theta = self.dof_pos[:, 0] return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index c17494ef..48488583 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -14,10 +14,10 @@ class init_state(PendulumCfg.init_state): dof_pos_range = { "theta": [-torch.pi, torch.pi], } - dof_vel_range = {"theta": [-1, 1]} + dof_vel_range = {"theta": [-5, 5]} class control(PendulumCfg.control): - ctrl_frequency = 20 + ctrl_frequency = 10 desired_sim_frequency = 100 class asset(PendulumCfg.asset): @@ -74,12 +74,12 @@ class critic: class reward: class weights: - theta = 1.0 - omega = 0.1 - equilibrium = 0.0 - energy = 1.0 + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.5 dof_vel = 0.0 - torques = 0.001 + torques = 0.025 class termination_weight: termination = 0.0 diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 89010ebf..24942825 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -65,9 +65,9 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - normalize_obs = True + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] @@ -80,9 +80,9 @@ class noise: class critic: critic_class_name = "" - normalize_obs = True + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] hidden_dims 
= [128, 64, 32] From 2ccfd8f90846e2411c3e23dcd7f9900eb063673e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 12 Jun 2024 10:38:09 -0400 Subject: [PATCH 65/98] refactorings --- gym/envs/pendulum/pendulum.py | 4 ---- gym/envs/pendulum/pendulum_config.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index d935b7b8..47d23eb5 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -19,10 +19,6 @@ def _reset_system(self, env_ids): [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 ) - def _check_terminations_and_timeouts(self): - super()._check_terminations_and_timeouts() - self.terminated = self.timed_out - def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 24942825..bc1cdea7 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**12 - num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 10.0 + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 class terrain(FixedRobotCfg.terrain): pass @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -65,6 +65,8 @@ class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" + + # TODO[lm]: Handle normalization in SAC, then also use it here again normalize_obs = False obs = [ "dof_pos_obs", @@ -80,6 +82,8 @@ class noise: class critic: critic_class_name = "" + + # TODO[lm]: Handle normalization in SAC, then also use it here again normalize_obs = False obs = [ "dof_pos_obs", @@ -126,4 +130,4 @@ class runner(FixedRobotCfgPPO.runner): experiment_name = "pendulum" max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 100 + num_steps_per_env = 32 From 49ddec29aa12f6e37dee9259ca781f130f440211 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 14 Jun 2024 14:52:01 -0400 Subject: [PATCH 66/98] fixes --- .gitignore | 1 - gym/envs/base/fixed_robot.py | 8 ++++---- gym/envs/mini_cheetah/mini_cheetah_SAC_config.py | 9 ++++----- gym/envs/pendulum/pendulum.py | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 6028e3b5..0b68375a 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -*smooth_exploration/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 73e2fb67..c3cf0566 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -568,11 +568,11 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, sigma=None): + def _sqrdexp(self, x, scale=1.0): """shorthand helper for squared exponential""" - if sigma is None: - sigma = self.cfg.reward_settings.tracking_sigma - return torch.exp(-torch.square(x) / sigma) + return torch.exp( + 
-torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma + ) def _reward_torques(self): """Penalize torques""" diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py index 7f2b3a1d..69da4cf8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -1,4 +1,4 @@ -from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.base.legged_robot_config import LeggedRobotRunnerCfg from gym.envs.mini_cheetah.mini_cheetah_ref_config import MiniCheetahRefCfg BASE_HEIGHT_REF = 0.3 @@ -7,7 +7,6 @@ class MiniCheetahSACCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): num_envs = 1 - num_actuators = 12 episode_length_s = 4 # TODO class terrain(MiniCheetahRefCfg.terrain): @@ -70,7 +69,7 @@ class scaling(MiniCheetahRefCfg.scaling): commands = [3, 1, 3] -class MiniCheetahSACRunnerCfg(FixedRobotCfgPPO): +class MiniCheetahSACRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OffPolicyRunner" @@ -155,7 +154,7 @@ class weights: class termination_weight: termination = 0.15 - class algorithm(FixedRobotCfgPPO.algorithm): + class algorithm(LeggedRobotRunnerCfg.algorithm): # Taken from SAC pendulum initial_fill = 500 storage_size = 10**6 @@ -175,7 +174,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): actor_lr = 1e-4 critic_lr = 1e-4 - class runner(FixedRobotCfgPPO.runner): + class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "sac_mini_cheetah" max_iterations = 50_000 diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 47d23eb5..39c8b331 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -33,11 +33,11 @@ def _reward_equilibrium(self): error = torch.stack( [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 ) - return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) + return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) def _reward_torques(self): """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) def _reward_energy(self): kinetic_energy = ( From 34e08927eab02bc960494da1ec7888b24330548b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 9 Jul 2024 19:35:26 -0400 Subject: [PATCH 67/98] fix shape of pendulum reward --- gym/envs/pendulum/pendulum.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 39c8b331..722b88d1 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -21,11 +21,11 @@ def _reset_system(self, env_ids): def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling - return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + return self._sqrdexp(theta_rwd) def _reward_omega(self): omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) - return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + return self._sqrdexp(omega_rwd) def _reward_equilibrium(self): theta_norm = self._normalize_theta() From 948f2af40dc49347b4ada678c2f5e1161a47990f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 10 Jul 2024 08:04:24 -0400 Subject: [PATCH 68/98] merge in smooth exploration, working, failed unit test --- .gitignore | 4 + gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot_config.py | 7 +- .../mini_cheetah/mini_cheetah_SAC_config.py | 4 +- 
gym/envs/mini_cheetah/mini_cheetah_config.py | 5 +- .../mini_cheetah/mini_cheetah_osc_config.py | 1 + .../mini_cheetah/mini_cheetah_ref_config.py | 6 +- gym/envs/mit_humanoid/mit_humanoid_config.py | 5 +- gym/envs/pendulum/pendulum_SAC_config.py | 4 +- gym/envs/pendulum/pendulum_config.py | 12 +- learning/algorithms/ppo.py | 3 + learning/algorithms/ppo2.py | 17 ++- learning/modules/__init__.py | 3 +- learning/modules/actor.py | 21 +++ learning/modules/smooth_actor.py | 125 ++++++++++++++++++ learning/modules/utils/neural_net.py | 20 ++- learning/runners/BaseRunner.py | 9 +- learning/runners/on_policy_runner.py | 69 +++++++++- 18 files changed, 286 insertions(+), 31 deletions(-) create mode 100644 learning/modules/smooth_actor.py diff --git a/.gitignore b/.gitignore index 0b68375a..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a9bdef47..7a19f2e4 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -134,7 +134,7 @@ class actor: "these_need_to_be_atributes_(states)_of_the_robot_env", ] normalize_obs = True - + smooth_exploration = False actions = ["tau_ff"] disable_actions = False diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index ce95ed0a..7dd59807 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -238,6 +238,9 @@ class actor: hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + normalize_obs = True + smooth_exploration = False + obs = [ "observation_a", "observation_b", @@ -292,7 +295,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_gradient_steps = 10 + max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**15 # new @@ -306,6 +309,8 @@ class algorithm: entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 + lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py index 69da4cf8..13348b69 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_SAC_config.py @@ -73,7 +73,7 @@ class MiniCheetahSACRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OffPolicyRunner" - class actor: + class actor(LeggedRobotRunnerCfg.actor): hidden_dims = { "latent": [128, 128], "mean": [64], @@ -111,7 +111,7 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class critic: + class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [128, 128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index ff82119f..157c6031 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -126,12 +126,11 @@ class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class actor: + class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" obs = [ - "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", @@ -154,7 
+153,7 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class critic: + class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index 5aaafde3..9d686f8e 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -167,6 +167,7 @@ class policy: critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index bd8140b2..89acb029 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -69,10 +69,12 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class actor: + class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False + exploration_sample_freq = 16 obs = [ "base_ang_vel", "projected_gravity", @@ -96,7 +98,7 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class critic: + class critic(MiniCheetahRunnerCfg.critic): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index c7e061a9..293270b2 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -181,12 +181,13 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class actor: + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_height", @@ -212,7 +213,7 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 - class critic: + class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 48488583..d555d5df 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -36,7 +36,7 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OffPolicyRunner" - class actor: + class actor(FixedRobotCfgPPO.actor): hidden_dims = { "latent": [128, 64], "mean": [32], @@ -61,7 +61,7 @@ class noise: dof_pos = 0.0 dof_vel = 0.0 - class critic: + class critic(FixedRobotCfgPPO.critic): obs = [ "dof_pos_obs", "dof_vel", diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 0fb705ae..561087f6 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,8 +33,8 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 10 - desired_sim_frequency = 100 + ctrl_frequency = 100 + desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # 
[N*m*s/rad] @@ -61,7 +61,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class actor: + class actor(FixedRobotCfgPPO.actor): hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" @@ -94,11 +94,11 @@ class critic: class reward: class weights: - theta = 0.0 - omega = 0.0 + theta = 0.1 + omega = 0.1 equilibrium = 1.0 energy = 0.5 - dof_vel = 0.0 + dof_vel = 0.1 torques = 0.025 class termination_weight: diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index befeea79..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -105,6 +105,9 @@ def init_storage( self.device, ) + def test_mode(self): + self.actor_critic.test() + def train_mode(self): self.actor_critic.train() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 09c60ab9..6c64e334 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -26,6 +26,8 @@ def __init__( schedule="fixed", desired_kl=0.01, device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -33,6 +35,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -90,7 +94,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -103,7 +107,7 @@ def update_actor(self, data): max_gradient_steps=self.max_gradient_steps, ) for batch in generator: - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std @@ -123,11 +127,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! 
check this diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 8d4a3ab1..f608cb29 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -34,4 +34,5 @@ from .state_estimator import StateEstimatorNN from .actor import Actor from .critic import Critic -from .chimera_actor import ChimeraActor \ No newline at end of file +from .chimera_actor import ChimeraActor +from .smooth_actor import SmoothActor diff --git a/learning/modules/actor.py b/learning/modules/actor.py index fbaa6868..c2cb89c5 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -25,6 +27,8 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions + self.hidden_dims = hidden_dims + self.activation = activation self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) # Action noise @@ -33,6 +37,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -41,6 +48,10 @@ def action_mean(self): def action_std(self): return self.distribution.stddev + @property + def get_std(self): + return self.distribution.stddev + @property def entropy(self): return self.distribution.entropy().sum(dim=-1) @@ -51,6 +62,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + sample = self.distribution.sample() + self.log_actions(mean[0][2], sample[0][2], path) + return sample return self.distribution.sample() def get_actions_log_prob(self, actions): @@ -67,3 +84,7 @@ def forward(self, observations): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..c89cff7c --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = True, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = 0.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, self.hidden_dims[-1], self.hidden_dims[:-1], self.activation + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std + self.weights_dist = Normal(torch.zeros_like(std), std) + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + @property + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_exp_ln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std + + def update_distribution(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std**2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + else: + raise ValueError("Invalid latent_sde dimension") + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample + + def act_inference(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + 
latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..b18efe96 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,6 +1,7 @@ import torch import os import copy +import numpy as np def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): @@ -56,7 +57,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=False): """ Thsi function traces and exports the given network module in .pt and .onnx file formats. These can be used for evaluation on other systems @@ -74,7 +75,22 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + # Export latent model + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.numpy().T + np.savetxt(path_std, std_transposed) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..6819f4f4 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,12 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c1758000..0469cc7a 100644 --- 
a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -21,7 +21,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -32,6 +32,11 @@ def learn(self): tot_iter = self.it + self.num_learning_iterations self.save() + # todo refactor into actor + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -60,9 +65,24 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # todo refactor into actor + # * Re-sample noise matrix for smooth exploration + if ( + self.actor_cfg["smooth_exploration"] + and i % self.actor_cfg["exploration_sample_freq"] == 0 + ): + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], @@ -160,9 +180,13 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], + ) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -201,3 +225,42 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
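+        # The caller must preallocate each entry so it can be indexed as
+        # [0, it_idx, i, :]; e.g. (illustrative shapes, not fixed by this code):
+        #   {"terminated": torch.zeros(1, tot_iter, steps, 1),
+        #    "dof_pos": torch.zeros(1, tot_iter, steps, num_dof)}
+        # "steps" is read back from the "terminated" entry below.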
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] From ce7ab90bd0d19d5dbe3daedc398b2456bc836a07 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 10 Jul 2024 08:41:08 -0400 Subject: [PATCH 69/98] fix export vector size --- learning/modules/utils/neural_net.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index b18efe96..a2176d53 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -75,7 +75,7 @@ def export_network(network, network_name, path, num_inputs, latent=False): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((num_inputs)) + dummy_input = torch.rand((1, num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) @@ -85,7 +85,7 @@ def export_network(network, network_name, path, num_inputs, latent=False): path_latent = os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() - dummy_input = torch.rand((num_inputs)) + dummy_input = torch.rand((1, num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) From c4069baea0a0e5ebc7a45fde277281444821bbdd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 10 Jul 2024 10:30:22 -0400 Subject: [PATCH 70/98] switch back export vector size for Robot-Software compabitibility --- learning/modules/utils/neural_net.py | 4 ++-- tests/integration_tests/test_runner_integration.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index a2176d53..b18efe96 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -75,7 +75,7 @@ def export_network(network, network_name, path, num_inputs, latent=False): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) @@ -85,7 +85,7 @@ def export_network(network, network_name, path, num_inputs, latent=False): path_latent = 
os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index f2e607c6..3310624f 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -118,7 +118,7 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.actor_cfg["obs"])[0:1] + test_input = runner.get_obs(runner.actor_cfg["obs"])[0] runner_out = runner.alg.actor.act_inference(test_input) loaded_out = loaded_actor.act_inference(test_input) From 9feaafba335335cfdc435092e30c6b19b5322082 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 10 Jul 2024 12:12:02 -0400 Subject: [PATCH 71/98] encapsulate smooth exploration into SmoothActor --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 ++-- learning/modules/smooth_actor.py | 11 +++++++-- learning/runners/on_policy_runner.py | 23 +------------------ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 89acb029..6593f6cc 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,7 @@ class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True exploration_sample_freq = 16 obs = [ "base_ang_vel", @@ -145,6 +145,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 1000 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 # deprecate diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index c89cff7c..4f89ba26 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -23,6 +23,7 @@ def __init__( learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, + exploration_sample_freq: int = 16, **kwargs, ): super().__init__(*args, **kwargs) @@ -45,13 +46,15 @@ def __init__( log_std = torch.ones(self.latent_dim, 1) self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) # Sample an exploration matrix - self.sample_weights() + self.update_exploration_matrices(batch_size=1) self.distribution = None # Debug mode for plotting self.debug = False + self.call_counter = 0 + self.resampling_frequenchy = exploration_sample_freq - def sample_weights(self, batch_size=1): + def update_exploration_matrices(self, batch_size=1): # Sample weights for the noise exploration matrix std = self.get_std self.weights_dist = Normal(torch.zeros_like(std), std) @@ -97,6 +100,10 @@ def update_distribution(self, observations): self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) def act(self, observations): + if self.call_counter % self.resampling_frequenchy == 0: + self.update_exploration_matrices(observations.shape[0]) + self.call_counter += 1 + 
self.update_distribution(observations) mean = self.distribution.mean sample = mean + self.get_noise() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 0469cc7a..fa9ef007 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -32,11 +32,6 @@ def learn(self, states_to_log_dict=None): tot_iter = self.it + self.num_learning_iterations self.save() - # todo refactor into actor - # * Initialize smooth exploration matrices - if self.actor_cfg["smooth_exploration"]: - self.alg.actor.sample_weights(batch_size=self.env.num_envs) - # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -75,14 +70,6 @@ def learn(self, states_to_log_dict=None): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - # todo refactor into actor - # * Re-sample noise matrix for smooth exploration - if ( - self.actor_cfg["smooth_exploration"] - and i % self.actor_cfg["exploration_sample_freq"] == 0 - ): - self.alg.actor.sample_weights(batch_size=self.env.num_envs) - actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], @@ -231,17 +218,10 @@ def sim_and_log_states(self, states_to_log_dict, it_idx): # Log states to the dict, as well as whether the env terminated. steps = states_to_log_dict["terminated"].shape[2] actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) with torch.inference_mode(): for i in range(steps): - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) - - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.policy_cfg["actions"], actions, @@ -253,7 +233,6 @@ def sim_and_log_states(self, states_to_log_dict, it_idx): actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) # Log states (just for the first env) terminated = self.get_terminated()[0] From 273538e4c20799714e3acfbc85edf73c2fe364ef Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 15:34:15 -0400 Subject: [PATCH 72/98] remove deug for actor, and get_std logging --- learning/modules/actor.py | 11 ----------- learning/modules/smooth_actor.py | 7 ------- learning/runners/on_policy_runner.py | 4 +--- 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index c2cb89c5..14104df7 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,8 +5,6 @@ from .utils import export_network from .utils import RunningMeanStd -from gym import LEGGED_GYM_ROOT_DIR - class Actor(nn.Module): def __init__( @@ -37,9 +35,6 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False - # Debug mode for plotting - self.debug = False - @property def action_mean(self): return self.distribution.mean @@ -62,12 +57,6 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if self.debug: - mean = self.distribution.mean - path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" - sample = self.distribution.sample() - self.log_actions(mean[0][2], sample[0][2], path) - return sample return self.distribution.sample() def 
get_actions_log_prob(self, actions): diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 4f89ba26..c02e9203 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -5,8 +5,6 @@ from .actor import Actor from .utils import create_MLP -from gym import LEGGED_GYM_ROOT_DIR - # The following implementation is based on the gSDE paper. See code: # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 @@ -49,8 +47,6 @@ def __init__( self.update_exploration_matrices(batch_size=1) self.distribution = None - # Debug mode for plotting - self.debug = False self.call_counter = 0 self.resampling_frequenchy = exploration_sample_freq @@ -107,9 +103,6 @@ def act(self, observations): self.update_distribution(observations) mean = self.distribution.mean sample = mean + self.get_noise() - if self.debug: - path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" - self.log_actions(mean[0][2], sample[0][2], path) return sample def act_inference(self, observations): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fa9ef007..f601ce71 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -171,9 +171,7 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category( - "actor", self.alg.actor, ["action_std", "entropy", "get_std"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) From 710ee9bd39a740562cb7a265746c90e7ecfb49da Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 10 Jul 2024 20:16:08 -0400 Subject: [PATCH 73/98] default to white noise exploration --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 6593f6cc..74e60d59 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,7 @@ class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = True + smooth_exploration = False exploration_sample_freq = 16 obs = [ "base_ang_vel", From 098989a175231ca70a814230c1adf9ca5aabbb9b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 6 Aug 2024 13:19:37 -0400 Subject: [PATCH 74/98] fix hips_forward dimensionality --- gym/envs/mit_humanoid/mit_humanoid.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 118ac170..265f0ff9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -223,12 +223,12 @@ def _reward_walk_freq(self): def _reward_hips_forward(self): # reward hip motors for pointing forward - hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) - hip_yaw_abad -= torch.stack( + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 ) - hip_yaw_abad /= torch.stack( - (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + hip_yaw_abad /= torch.cat( + 
(self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) return (hip_yaw_abad).pow(2).mean(dim=1) # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) From 0611bcd52ba326d8a23b929a679382a72f14b0f9 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 6 Aug 2024 13:43:10 -0400 Subject: [PATCH 75/98] lander env --- gym/envs/__init__.py | 6 +- gym/envs/mit_humanoid/lander.py | 195 +++++++++++++++ gym/envs/mit_humanoid/lander_config.py | 331 +++++++++++++++++++++++++ 3 files changed, 531 insertions(+), 1 deletion(-) create mode 100644 gym/envs/mit_humanoid/lander.py create mode 100644 gym/envs/mit_humanoid/lander_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index f9d8b788..26710fce 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,8 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "Pendulum": ".pendulum.pendulum" + "Pendulum": ".pendulum.pendulum", + "Lander": ".mit_humanoid.lander", } config_dict = { @@ -34,6 +35,7 @@ "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", "PendulumSACCfg": ".pendulum.pendulum_SAC_config", + "LanderCfg": ".mit_humanoid.lander_config", } runner_config_dict = { @@ -48,6 +50,7 @@ "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", + "LanderRunnerCfg": ".mit_humanoid.lander_config", } task_dict = { @@ -77,6 +80,7 @@ "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], + "lander": ["Lander", "LanderCfg", "LanderRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py new file mode 100644 index 00000000..5856efb8 --- /dev/null +++ b/gym/envs/mit_humanoid/lander.py @@ -0,0 +1,195 @@ +import torch + +from gym.envs.base.legged_robot import LeggedRobot + +# from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from .jacobian import _apply_coupling + + +class Lander(LeggedRobot): + def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): + super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + + def _init_buffers(self): + super()._init_buffers() + self._init_sampled_history_buffers() + + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), + device=self.device, + ) + self.sampled_history_dof_pos = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_dof_vel = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency + ) + + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + + self._reset_sampled_history_buffers(env_ids) + return + + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + 
self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) + + # compute_torques accounting for coupling, and filtering torques + def _compute_torques(self): + torques = _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + torques = torques.clip(-self.torque_limits, self.torque_limits) + return torques + + def _post_decimation_step(self): + super()._post_decimation_step() + self._update_sampled_history_buffers() + + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 + + ids = torch.nonzero( + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, + ).flatten() + + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids] = torch.roll( + self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] + + self.sampled_history_counter[ids] = 0 + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + + def _reward_lin_vel_z(self): + # Penalize z axis base linear velocity w. squared exp + return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) + + def _reward_orientation(self): + # Penalize non flat base orientation + return torch.sum( + self._sqrdexp(torch.square(self.projected_gravity[:, :2])), dim=1 + ) + + def _reward_min_base_height(self): + """Squared exponential saturating at base_height target""" + error = self.base_height - self.cfg.reward_settings.base_height_target + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling. 
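+        # * e.g. at |cmd| = 1 m/s a 0.2 m/s tracking error is halved to 0.1 before
+        # * the squared exponential, so faster commands are graded more leniently.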
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + return torch.mean(self._sqrdexp(error), dim=1) + + def _reward_dof_vel(self): + # Penalize dof velocities + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") + + def _reward_dof_near_home(self): + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_walk_freq(self): + # Penalize deviation from base frequency + return torch.mean( + self._sqrdexp( + (self.oscillator_freq - self.cfg.oscillator.base_frequency) + / self.cfg.oscillator.base_frequency + ), + dim=1, + ) * self._switch("move") + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.stack( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.stack( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py new file mode 100644 index 00000000..a4262f0d --- /dev/null +++ b/gym/envs/mit_humanoid/lander_config.py @@ -0,0 +1,331 @@ +from gym.envs.base.legged_robot_config import ( + LeggedRobotCfg, + LeggedRobotRunnerCfg, +) + +BASE_HEIGHT_REF = 0.80 + + +class LanderCfg(LeggedRobotCfg): + class env(LeggedRobotCfg.env): + num_envs = 4096 + num_actuators = 18 + episode_length_s = 5 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] + + class terrain(LeggedRobotCfg.terrain): + pass + + class init_state(LeggedRobotCfg.init_state): + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position with added randomized noise. + # * "reset_to_range" = a range of joint positions and velocities. + # * "reset_to_traj" = feed in a trajectory to sample from. 
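+        # * "reset_to_range" is used here; the dof_*_range and root_*_range fields
+        # * below define the intervals the initial states are drawn from.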
+ reset_mode = "reset_to_range" + + default_joint_angles = { + "hip_yaw": 0.0, + "hip_abad": 0.0, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, + "shoulder_pitch": 0.0, + "shoulder_abad": 0.0, + "shoulder_yaw": 0.0, + "elbow": -1.25, + } + + # * default COM for basic initialization + pos = [0.0, 0.0, 0.6] # x,y,z [m] + rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] + lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] + ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] + + # * initialization for random range setup + + dof_pos_range = { + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0, 0], + } + dof_vel_range = { + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0.0, 0.0], + } + + root_pos_range = [ + [0.0, 0.0], # x + [0.0, 0.0], # y + [1.0, 1.5], # z + [-0.1, 0.1], # roll + [-0.1, 0.1], # pitch + [-0.1, 0.1], + ] # yaw + + root_vel_range = [ + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-2.5, 0.25], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] + + class control(LeggedRobotCfg.control): + # * PD Drive parameters: + stiffness = { + "hip_yaw": 30.0, + "hip_abad": 30.0, + "hip_pitch": 30.0, + "knee": 30.0, + "ankle": 30.0, + "shoulder_pitch": 40.0, + "shoulder_abad": 40.0, + "shoulder_yaw": 40.0, + "elbow": 50.0, + } # [N*m/rad] + damping = { + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, + } # [N*m*s/rad] + + ctrl_frequency = 100 + desired_sim_frequency = 1000 + + # class oscillator: + # base_frequency = 3.0 # [Hz] + + class commands: + resampling_time = 10.0 # time before command are changed[s] + + class ranges: + lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 0.0 # max [rad/s] + + class push_robots: + toggle = True + interval_s = 1 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] + + class domain_rand: + randomize_friction = True + friction_range = [0.5, 1.25] + randomize_base_mass = True + added_mass_range = [-1.0, 1.0] + + class asset(LeggedRobotCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" + ) + # foot_collisionbox_names = ["foot"] + foot_name = "foot" + penalize_contacts_on = ["arm"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["hand", "foot"] # ?? + flip_visual_attachments = False + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter + collapse_fixed_joints = False + # * see GymDofDriveModeFlags + # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) + default_dof_drive_mode = 3 + fix_base_link = False + disable_gravity = False + disable_motors = False + total_mass = 25.0 + + class reward_settings(LeggedRobotCfg.reward_settings): + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 + max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + # a smooth switch based on |cmd| (commanded velocity). 
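+        # _switch() in lander.py computes exp(-max(0, |cmd| - switch_scale)**2 / switch_scale),
+        # so with switch_scale = 0.5 the stand-mode switch stays at 1.0 for command norms
+        # up to 0.5 m/s and decays to ~0.61 at 1.0 and ~0.14 at 1.5.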
+ switch_scale = 0.5 + switch_threshold = 0.2 + + class scaling(LeggedRobotCfg.scaling): + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales + dof_pos_target = dof_pos + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos + + +class LanderRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + init_noise_std = 1.0 + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + smooth_exploration = False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + # "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + # "sampled_history_dof_pos", + # "sampled_history_dof_vel", + # "sampled_history_dof_pos_target", + # "oscillator_obs", + ] + normalize_obs = True + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + normalize_obs = True + + class reward: + class weights: + # tracking_lin_vel = 0.0 + # tracking_ang_vel = 1.5 + # orientation = 1.0 + torques = 5.0e-4 + # min_base_height = 1.5 + action_rate = 1e-3 + action_rate2 = 1e-3 + lin_vel_z = 0.0 + ang_vel_xy = 0.0 + # dof_vel = 0.25 + # stand_still = 0.25 + dof_pos_limits = 0.25 + dof_near_home = 0.25 + # stance = 1.0 + # swing = 1.0 + hips_forward = 0.0 + # walk_freq = 0.0 # 2.5 + + class termination_weight: + termination = 15 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 + + class runner(LeggedRobotRunnerCfg.runner): + policy_class_name = "ActorCritic" + algorithm_class_name = "PPO2" + num_steps_per_env = 24 + max_iterations = 1000 + run_name = "Standing" + experiment_name = "Humanoid" + save_interval = 50 From fd308656c2aeaf81e02c79c1d19c22c976ebf02b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 7 Aug 2024 14:17:00 -0400 Subject: [PATCH 76/98] unify humanoid and lander put jacobian into MIT_humanoid base environment, rather than import; importing breaks when running --original_config --- gym/envs/mit_humanoid/lander.py | 113 +++---------------------- gym/envs/mit_humanoid/lander_config.py | 36 ++++---- gym/envs/mit_humanoid/mit_humanoid.py | 99 ++++++++++------------ 3 files changed, 74 insertions(+), 174 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index 5856efb8..972eb413 100644 --- a/gym/envs/mit_humanoid/lander.py 
+++ b/gym/envs/mit_humanoid/lander.py @@ -1,98 +1,13 @@ import torch -from gym.envs.base.legged_robot import LeggedRobot +# from gym.envs.base.legged_robot import LeggedRobot +from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid -# from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid -from .jacobian import _apply_coupling - -class Lander(LeggedRobot): +class Lander(MIT_Humanoid): def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): super().__init__(gym, sim, cfg, sim_params, sim_device, headless) - def _init_buffers(self): - super()._init_buffers() - self._init_sampled_history_buffers() - - def _init_sampled_history_buffers(self): - self.sampled_history_dof_pos_target = torch.zeros( - (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), - device=self.device, - ) - self.sampled_history_dof_pos = torch.zeros( - self.num_envs, - self.num_dof * self.cfg.env.sampled_history_length, - device=self.device, - ) - self.sampled_history_dof_vel = torch.zeros( - self.num_envs, - self.num_dof * self.cfg.env.sampled_history_length, - device=self.device, - ) - self.sampled_history_counter = torch.zeros( - self.num_envs, dtype=int, device=self.device - ) - self.sampled_history_threshold = int( - self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency - ) - - def _reset_system(self, env_ids): - if len(env_ids) == 0: - return - super()._reset_system(env_ids) - - self._reset_sampled_history_buffers(env_ids) - return - - def _reset_sampled_history_buffers(self, ids): - n = self.cfg.env.sampled_history_length - self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) - self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) - self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) - - # compute_torques accounting for coupling, and filtering torques - def _compute_torques(self): - torques = _apply_coupling( - self.dof_pos, - self.dof_vel, - self.dof_pos_target + self.default_dof_pos, - self.dof_vel_target, - self.p_gains, - self.d_gains, - self.tau_ff, - ) - torques = torques.clip(-self.torque_limits, self.torque_limits) - return torques - - def _post_decimation_step(self): - super()._post_decimation_step() - self._update_sampled_history_buffers() - - def _update_sampled_history_buffers(self): - self.sampled_history_counter += 1 - - ids = torch.nonzero( - self.sampled_history_counter == self.sampled_history_threshold, - as_tuple=False, - ).flatten() - - self.sampled_history_dof_pos_target[ids] = torch.roll( - self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ - ids - ] - self.sampled_history_dof_pos[ids] = torch.roll( - self.sampled_history_dof_pos[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] - self.sampled_history_dof_vel[ids] = torch.roll( - self.sampled_history_dof_vel[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - - self.sampled_history_counter[ids] = 0 - # --- rewards --- def _switch(self, mode=None): @@ -172,24 +87,18 @@ def _compute_grf(self, grf_norm=True): def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 - def _reward_walk_freq(self): - # Penalize deviation from base frequency - return torch.mean( - self._sqrdexp( - (self.oscillator_freq - 
self.cfg.oscillator.base_frequency) - / self.cfg.oscillator.base_frequency - ), - dim=1, - ) * self._switch("move") - def _reward_hips_forward(self): # reward hip motors for pointing forward - hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) - hip_yaw_abad -= torch.stack( + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 ) - hip_yaw_abad /= torch.stack( - (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) return (hip_yaw_abad).pow(2).mean(dim=1) # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + + def _reward_power(self): + power = self.torques * self.dof_vel + return power.pow(2).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index a4262f0d..0de34b4c 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -10,7 +10,7 @@ class LanderCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 num_actuators = 18 - episode_length_s = 5 # episode length in seconds + episode_length_s = 10 # episode length in seconds sampled_history_length = 3 # n samples sampled_history_frequency = 10 # [Hz] @@ -71,7 +71,7 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [1.0, 1.5], # z + [0.64, 1.5], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], @@ -144,7 +144,7 @@ class asset(LeggedRobotCfg.asset): ) # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["arm"] + penalize_contacts_on = ["arm", "hand", "shoulder"] terminate_after_contacts_on = ["base"] end_effector_names = ["hand", "foot"] # ?? 
flip_visual_attachments = False @@ -228,7 +228,7 @@ class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" smooth_exploration = False obs = [ @@ -236,16 +236,15 @@ class actor(LeggedRobotRunnerCfg.actor): "base_lin_vel", "base_ang_vel", "projected_gravity", - # "commands", + "commands", "dof_pos_obs", "dof_vel", "dof_pos_history", - # "sampled_history_dof_pos", - # "sampled_history_dof_vel", - # "sampled_history_dof_pos_target", - # "oscillator_obs", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -261,10 +260,10 @@ class noise: class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" obs = [ - "base_height", + # "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -272,21 +271,25 @@ class critic(LeggedRobotRunnerCfg.critic): "dof_pos_obs", "dof_vel", "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: # tracking_lin_vel = 0.0 # tracking_ang_vel = 1.5 # orientation = 1.0 - torques = 5.0e-4 - # min_base_height = 1.5 + torques = 5.0e-5 + power = 0 # 1.0e-2 + min_base_height = 1.5 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 - # dof_vel = 0.25 + dof_vel = 0.5 # stand_still = 0.25 dof_pos_limits = 0.25 dof_near_home = 0.25 @@ -294,6 +297,7 @@ class weights: # swing = 1.0 hips_forward = 0.0 # walk_freq = 0.0 # 2.5 + collision = 1.0 class termination_weight: termination = 15 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 265f0ff9..a580fd57 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -1,8 +1,6 @@ import torch from gym.envs.base.legged_robot import LeggedRobot -from .jacobian import _apply_coupling -from gym.utils import exp_avg_filter class MIT_Humanoid(LeggedRobot): @@ -11,9 +9,6 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() - self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) - self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) - self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) self._init_sampled_history_buffers() def _init_sampled_history_buffers(self): @@ -38,22 +33,51 @@ def _init_sampled_history_buffers(self): self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) + def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): + # Create a Jacobian matrix and move it to the same device as input tensors + J = torch.eye(q.shape[-1]).to(q.device) + J[4, 3] = 1 + J[9, 8] = 1 + + # Perform transformations using Jacobian + q = torch.matmul(q, J.T) + qd = torch.matmul(qd, J.T) + q_des = torch.matmul(q_des, J.T) + qd_des = torch.matmul(qd_des, J.T) + + # Inverse of the transpose of Jacobian + J_inv_T = torch.inverse(J.T) + + # Compute feed-forward torques + tau_ff = torch.matmul(J_inv_T, tau_ff.T).T + + # Compute kp and kd terms + kp = torch.diagonal( + torch.matmul( + torch.matmul(J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), J_inv_T.T + ), + dim1=-2, + 
dim2=-1, + ) + + kd = torch.diagonal( + torch.matmul( + torch.matmul(J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), J_inv_T.T + ), + dim1=-2, + dim2=-1, + ) + + # Compute torques + torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = torch.matmul(torques, J) + + return torques + def _reset_system(self, env_ids): if len(env_ids) == 0: return super()._reset_system(env_ids) - # reset oscillators, with a pi phase shift between left and right - self.oscillators[env_ids, 0] = ( - torch.rand(len(env_ids), device=self.device) * 2 * torch.pi - ) - self.oscillators[env_ids, 1] = self.oscillators[env_ids, 0] + torch.pi - self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) - # reset oscillator velocities to base freq - self.oscillator_freq[env_ids] = self.cfg.oscillator.base_frequency - # recompute oscillator observations - self.oscillator_obs = torch.cat( - (self.oscillators.cos(), self.oscillators.sin()), dim=1 - ) self._reset_sampled_history_buffers(env_ids) return @@ -65,7 +89,7 @@ def _reset_sampled_history_buffers(self, ids): # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): - torques = _apply_coupling( + torques = self._apply_coupling( self.dof_pos, self.dof_vel, self.dof_pos_target + self.default_dof_pos, @@ -74,25 +98,12 @@ def _compute_torques(self): self.d_gains, self.tau_ff, ) - torques = torques.clip(-self.torque_limits, self.torque_limits) - return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) - - # oscillator integration + return torques.clip(-self.torque_limits, self.torque_limits) def _post_decimation_step(self): super()._post_decimation_step() - self._step_oscillators() self._update_sampled_history_buffers() - def _step_oscillators(self, dt=None): - if dt is None: - dt = self.dt - self.oscillators += (self.oscillator_freq * 2 * torch.pi) * dt - self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) - self.oscillator_obs = torch.cat( - (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 - ) - def _update_sampled_history_buffers(self): self.sampled_history_counter += 1 @@ -187,20 +198,6 @@ def _reward_stand_still(self): rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) return rew_vel + rew_pos - rew_base_vel * self._switch("stand") - def _reward_stance(self): - # phase = torch.maximum( - # torch.zeros_like(self.oscillators), -self.oscillators.sin() - # ) # positive during swing, negative during stance - phase = self.smooth_sqr_wave(self.oscillators) - return (phase * self._compute_grf()).mean(dim=1) - - def _reward_swing(self): - # phase = torch.maximum( - # torch.zeros_like(self.oscillators), self.oscillators.sin() - # ) # positive during swing, negative during stance - phase = self.smooth_sqr_wave(self.oscillators + torch.pi) - return -(phase * self._compute_grf()).mean(dim=1) - def _compute_grf(self, grf_norm=True): grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) if grf_norm: @@ -211,16 +208,6 @@ def _compute_grf(self, grf_norm=True): def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 - def _reward_walk_freq(self): - # Penalize deviation from base frequency - return torch.mean( - self._sqrdexp( - (self.oscillator_freq - self.cfg.oscillator.base_frequency) - / self.cfg.oscillator.base_frequency - ), - dim=1, - ) * self._switch("move") - def _reward_hips_forward(self): # reward hip motors for pointing forward hip_yaw_abad = 
torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) From 6248559e8229abc2daba833966dfaa9e1786a5b7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 27 Aug 2024 19:05:11 -0400 Subject: [PATCH 77/98] everything integrated and works --- gym/envs/__init__.py | 2 +- gym/envs/base/fixed_robot.py | 21 +- gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot.py | 9 +- gym/envs/base/legged_robot_config.py | 10 +- gym/envs/base/task_skeleton.py | 1 + gym/envs/cartpole/cartpole_config.py | 22 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 +- .../mini_cheetah/mini_cheetah_osc_config.py | 71 +- .../mini_cheetah/mini_cheetah_ref_config.py | 12 +- gym/envs/pendulum/pendulum.py | 43 +- gym/envs/pendulum/pendulum_config.py | 2 +- gym/utils/helpers.py | 2 +- learning/__init__.py | 6 + learning/algorithms/__init__.py | 2 +- learning/modules/QRCritics.py | 849 ++++++++++++++++++ learning/modules/__init__.py | 1 + learning/modules/critic.py | 3 +- learning/modules/utils/neural_net.py | 91 +- learning/runners/BaseRunner.py | 3 + learning/runners/__init__.py | 4 +- learning/runners/custom_critic_runner.py | 178 ++++ learning/runners/datalogging_runner.py | 213 +++++ learning/runners/my_runner.py | 55 +- learning/runners/on_policy_runner.py | 5 +- learning/utils/dict_utils.py | 43 +- requirements.txt | 3 +- 27 files changed, 1503 insertions(+), 153 deletions(-) create mode 100644 learning/modules/QRCritics.py create mode 100644 learning/runners/custom_critic_runner.py create mode 100644 learning/runners/datalogging_runner.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index f9d8b788..1b78a261 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,7 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "Pendulum": ".pendulum.pendulum" + "Pendulum": ".pendulum.pendulum", } config_dict = { diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index c3cf0566..544eba52 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -79,7 +79,7 @@ def _step_physx_sim(self): self.num_envs, device=self.device ) self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(self.torques) + self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) ) self.gym.simulate(self.sim) if self.device == "cpu": @@ -184,10 +184,7 @@ def _process_dof_props(self, props, env_id): try: # ! this seems like a crime... self.torque_limits[i] = props["effort"][i].item() except: - print( - "WARNING: passive joints need to be listed after " - + "active joints in the URDF." 
- ) + print("WARNING: your system has unactuated joints") # * soft limits m = (self.dof_pos_limits[i, 0] + self.dof_pos_limits[i, 1]) / 2 r = self.dof_pos_limits[i, 1] - self.dof_pos_limits[i, 0] @@ -399,10 +396,7 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( @@ -568,11 +562,12 @@ def _get_env_origins(self): # TODO: do without terrain # ------------ reward functions---------------- - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) def _reward_torques(self): """Penalize torques""" diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 7a19f2e4..60160b9c 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -169,7 +169,7 @@ class algorithm: lam = 0.95 # shared batch_size = 2**15 - max_gradient_steps = 10 + max_gradient_steps = 24 # new storage_size = 2**17 # new batch_size = 2**15 # new diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index e02507d5..9923977d 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -980,11 +980,12 @@ def _get_heights(self, env_ids=None): return heights.view(self.num_envs, -1) * self.terrain.cfg.vertical_scale - def _sqrdexp(self, x, scale=1.0): + def _sqrdexp(self, x, sigma=None): """shorthand helper for squared exponential""" - return torch.exp( - -torch.square(x / scale) / self.cfg.reward_settings.tracking_sigma - ) + if sigma is None: + return torch.exp(-torch.square(x) / self.cfg.reward_settings.tracking_sigma) + else: + return torch.exp(-torch.square(x) / sigma) # ------------ reward functions---------------- diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7dd59807..0b9cf7a4 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -246,7 +246,11 @@ class actor: "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - normalize_obs = True + critic_obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] actions = ["q_des"] disable_actions = False @@ -309,8 +313,8 @@ class algorithm: entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed desired_kl = 0.01 - lr_range = [2e-4, 1e-2] - lr_ratio = 1.3 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 6a1b352c..0974970a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] 
= False diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index 6e073d5c..c3489021 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -7,7 +7,7 @@ class CartpoleCfg(FixedRobotCfg): class env(FixedRobotCfg.env): num_envs = 4024 num_actuators = 1 # 1 for the cart force - episode_length_s = 20 + episode_length_s = 5.0 class terrain(FixedRobotCfg.terrain): pass @@ -35,7 +35,8 @@ class control(FixedRobotCfg.control): 1, # slider_to_cart 0, ] # cart_to_pole - + stiffness = {"slider_to_cart": 0.0} + damping = {"slider_to_cart": 0.0} ctrl_frequency = 250 desired_sim_frequency = 500 @@ -119,25 +120,12 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 - schedule = "adaptive" # could be adaptive, fixed - discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 0.2 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 96 # per iteration + num_steps_per_env = 32 # per iteration max_iterations = 500 # number of policy updates # * logging diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 157c6031..ba2655f7 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -138,7 +138,7 @@ class actor(LeggedRobotRunnerCfg.actor): "dof_vel", "dof_pos_target", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] add_noise = True disable_actions = False @@ -167,7 +167,6 @@ class critic(LeggedRobotRunnerCfg.critic): "dof_vel", "dof_pos_target", ] - normalize_obs = True class reward: class weights: diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index 9d686f8e..b16bf919 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -69,27 +69,28 @@ class control(MiniCheetahCfg.control): class osc: process_noise_std = 0.25 + grf_threshold = 0.1 # 20. # Normalized to body weight # oscillator parameters - omega = 3 # 0.5 #3.5 # in Hz - coupling = 1 # 0.02 - osc_bool = False - grf_bool = False + omega = 3 # gets overwritten + coupling = 1 # gets overwritten + osc_bool = False # not used in paper + grf_bool = False # not used in paper randomize_osc_params = False - grf_threshold = 0.1 # 20. # Normalized to body weight + # randomization ranges: not used in paper omega_range = [1.0, 4.0] # [0.0, 10.] coupling_range = [ 0.0, 1.0, ] # with normalized grf, can have omega/coupling on same scale offset_range = [0.0, 0.0] + + # choice of oscillator parameters, see paper equation (5) stop_threshold = 0.5 omega_stop = 1.0 omega_step = 2.0 omega_slope = 1.0 omega_max = 4.0 omega_var = 0.25 - # coupling_step = 0. - # coupling_stop = 0. 
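+        # (interpretive note, not checked against Eq. (5): omega_stop looks like the
+        #  frequency used below stop_threshold, omega_step the nominal stepping
+        #  frequency, with omega_slope / omega_max / omega_var shaping and bounding
+        #  the GRF-dependent term; the coupling_* constants below appear to play the
+        #  analogous role for the inter-oscillator coupling strength)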
coupling_stop = 4.0 coupling_step = 1.0 coupling_slope = 0.0 @@ -160,11 +161,10 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "MyRunner" - class policy: + class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = False @@ -178,8 +178,25 @@ class policy: "oscillator_obs", "dof_pos_target", ] + actions = ["dof_pos_target"] + normalize_obs = False - critic_obs = [ + class noise(MiniCheetahRunnerCfg.actor.noise): + pass + + class critic(MiniCheetahRunnerCfg.critic): + hidden_dims = [128, 128] + activation = "relu" + normalize_obs = False + + critic_class_name = "Critic" # "DenseSpectralLatent" + # * some class-specific params + # minimize = False + # relative_dim = 8 # 16 + # latent_dim = 32 # 18, + # latent_hidden_dims = [128, 128] + # latent_activation = "relu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -188,14 +205,9 @@ class policy: "dof_pos_obs", "dof_vel", "oscillator_obs", - "oscillators_vel", "dof_pos_target", ] - - actions = ["dof_pos_target"] - - class noise: - pass + normalize_obs = False class reward: class weights: @@ -227,20 +239,23 @@ class termination_weight: termination = 15.0 / 100.0 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.98 + # new + storage_size = 2**17 + batch_size = 2**15 + max_grad_steps = 24 + clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 8 learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 - GAE_bootstrap_horizon = 2.0 - desired_kl = 0.01 max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 74e60d59..e9af92ac 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -72,9 +72,9 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" smooth_exploration = False exploration_sample_freq = 16 + activation = ["elu", "elu", "tanh"] obs = [ "base_ang_vel", "projected_gravity", @@ -83,7 +83,7 @@ class actor(MiniCheetahRunnerCfg.actor): "dof_vel", "phase_obs", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -98,10 +98,10 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class critic(MiniCheetahRunnerCfg.critic): - hidden_dims = [256, 256, 128] + class critic: + hidden_dims = [256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = ["elu", "tanh"] obs = [ "base_height", "base_lin_vel", @@ -113,7 +113,7 @@ class critic(MiniCheetahRunnerCfg.critic): "phase_obs", "dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 
722b88d1..a3bbc355 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,3 +1,4 @@ +from math import sqrt import torch import numpy as np @@ -19,6 +20,28 @@ def _reset_system(self, env_ids): [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 ) + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) + def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd) @@ -28,16 +51,22 @@ def _reward_omega(self): return self._sqrdexp(omega_rwd) def _reward_equilibrium(self): - theta_norm = self._normalize_theta() - omega = self.dof_vel[:, 0] - error = torch.stack( - [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 - ) - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) + # theta_norm = self._normalize_theta() + # omega = self.dof_vel[:, 0] + # error = torch.stack( + # [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 + # ) + # return self._sqrdexp(torch.mean(error, dim=1), 0.01) + + # todo compare alternatives + error = torch.abs(self.dof_state) + error[:, 0] /= self.scales["dof_pos"] + error[:, 1] /= self.scales["dof_vel"] + return self._sqrdexp(torch.mean(error, dim=1), sigma=0.01) def _reward_torques(self): """Penalize torques""" - return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): kinetic_energy = ( diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 561087f6..bb33c1f1 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -33,7 +33,7 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 + ctrl_frequency = 25 desired_sim_frequency = 200 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..1e421dac 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -191,7 +191,7 @@ def get_args(custom_parameters=None): { "name": "--task", "type": str, - "default": "cartpole", + "default": "pendulum", "help": "Resume training or start testing from a checkpoint. " "Overrides config file if provided.", }, diff --git a/learning/__init__.py b/learning/__init__.py index 34f15bd6..15c19314 100644 --- a/learning/__init__.py +++ b/learning/__init__.py @@ -29,3 +29,9 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# # Copyright (c) 2021 ETH Zurich, Nikita Rudin + +import os + +LEGGED_GYM_ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.realpath(__file__)) +) diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 78231181..7211a720 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,4 +33,4 @@ from .ppo import PPO from .ppo2 import PPO2 from .SE import StateEstimator -from .sac import SAC \ No newline at end of file +from .sac import SAC diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py new file mode 100644 index 00000000..a65ec304 --- /dev/null +++ b/learning/modules/QRCritics.py @@ -0,0 +1,849 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from learning.modules.utils import RunningMeanStd, create_MLP + + +def create_lower_diagonal(x, n, device): + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def create_PD_lower_diagonal(x, n, device): + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + diag_indices = (tril_indices[0] == tril_indices[1]).nonzero(as_tuple=True)[0] + x[..., diag_indices] = F.softplus(x[..., diag_indices]) + L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) + rows, cols = tril_indices + L[..., rows, cols] = x + return L + + +def compose_cholesky(L): + return torch.einsum("...ij, ...jk -> ...ik", L, L.transpose(-2, -1)) + + +def quadratify_xAx(x, A): + return torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ).squeeze() + + +def init_weights(m): + if isinstance(m, nn.Linear): + torch.nn.init.xavier_uniform_(m.weight) + m.bias.data.fill_(0.01) + + +def least_squares_fit(x, y): + x_flat = x.view(-1, x.shape[-1]) + y_flat = y.view(-1, y.shape[-1]) + ones = torch.ones(x_flat.shape[0], 1, device=x.device) + x_aug = torch.cat([ones, x_flat], dim=1) + # Use torch.linalg.lstsq to find the least-squares solution + result = torch.linalg.lstsq(x_aug, y_flat) + coefficients = result.solution + bias = coefficients[0] + weights = coefficients[1:] + + return weights, bias + + +def forward_affine(x, weights, bias): + x_flat = x.view(-1, x.shape[-1]) + x_aug = torch.cat( + [torch.ones(x_flat.shape[0], 1, device=x_flat.device), x_flat], dim=1 + ) + return x_aug.matmul(torch.cat([bias.unsqueeze(0), weights], dim=0)) + + +class Critic(nn.Module): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + device="cuda", + **kwargs, + ): + super().__init__() + + self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + def forward(self, x, return_all=False): + if self._normalize_obs: + with torch.no_grad(): + x = self.obs_rms(x) + value = self.NN(x).squeeze() + self.value_offset + if return_all: + return {"value": value} + else: + return value + + def evaluate(self, critic_observations, return_all=False): + return self.forward(critic_observations, return_all=return_all) + + def loss_fn(self, obs, target, **kwargs): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") + + +class OuterProduct(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, 
+ activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.NN = create_MLP(num_obs, latent_dim, hidden_dims, activation, dropouts) + # todo: have a linear NN to automatically coord-shift input + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + return loss_NN + + +class OuterProductLatent(OuterProduct): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + latent_hidden_dims=[128, 128], + latent_activation="tanh", + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim, + activation, + dropouts, + normalize_obs, + minimize, + device, + **kwargs, + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.NN(x) + # outer product. 
Both of these are equivalent + A = z.unsqueeze(-1) @ z.unsqueeze(-2) + # A2 = torch.einsum("nmx,nmy->nmxy", z, z) + value = self.sign * quadratify_xAx(self.latent_NN(x), A) + value += self.value_offset + + if return_all: + # return value, A, L + return {"value": value, "A": A, "z": z} + else: + return value + + +class CholeskyInput(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, num_lower_diag_elements, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + value = self.sign * quadratify_xAx(x, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + return F.mse_loss(self.forward(obs), target, reduction="mean") + + +class CholeskyLatent(CholeskyInput): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=None, + activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims, + latent_dim=latent_dim, + activation=activation, + dropouts=dropouts, + normalize_obs=normalize_obs, + minimize=minimize, + device="cuda", + **kwargs, + ) + + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + # self.latent_NN.apply(init_weights) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + + value = self.sign * quadratify_xAx(z, A) + # do not affect value offset in this part of the loss + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L, z + return {"value": value, "A": A, "L": L, "z": z} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + +class PDCholeskyInput(CholeskyInput): + def forward(self, x, return_all=False): + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + # value = self.sign * quadratify_xAx(x, A) + value = quadratify_xAx(x, A) + value *= 1.0 if self.minimize else -1.0 + value += 
self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + # return self.sign * quadratify_xAx(x, A) + self.value_offset + + +class PDCholeskyLatent(CholeskyLatent): + def forward(self, x, return_all=False): + z = self.latent_NN(x) + output = self.lower_diag_NN(x) + L = create_PD_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(L) + # assert (torch.linalg.eigvals(A).real > 0).all() + value = self.sign * quadratify_xAx(z, A) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L} + else: + return value + + +class SpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=None, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=None, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_outputs = ( + torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + + relative_dim + ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[..., self.relative_dim :] + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class DenseSpectralLatent(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + activation="elu", + dropouts=None, + normalize_obs=False, + relative_dim=None, + latent_dim=None, + latent_hidden_dims=[64], + latent_activation="tanh", + latent_dropouts=None, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if relative_dim is None: + relative_dim = num_obs + if latent_dim is None: + latent_dim = num_obs + + self.num_obs = num_obs + self.relative_dim = relative_dim + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + # TODO up to here, same as Cholesky, can be refactored + n_zeros = sum(range(relative_dim)) + n_outputs = relative_dim + relative_dim * latent_dim - n_zeros + self.rel_fill = n_zeros + # n_outputs = ( + # relative_dim * latent_dim + # - torch.tril_indices(self.relative_dim, self.latent_dim).shape[1] + # + relative_dim + # ) + self.spectral_NN = create_MLP( + num_obs, n_outputs, hidden_dims, activation, dropouts + ) + self.latent_NN = create_MLP( + num_obs, latent_dim, latent_hidden_dims, latent_activation + ) + + def forward(self, x, return_all=False): + z = self.latent_NN(x) + y = self.spectral_NN(x) + A_diag = torch.diag_embed(F.softplus(y[..., : self.relative_dim])) + # tril_indices = torch.tril_indices(self.latent_dim, self.relative_dim) + tril_indices = torch.tril_indices(self.relative_dim, self.latent_dim) + L = torch.zeros( + (*x.shape[:-1], self.relative_dim, self.latent_dim), device=self.device + ) + L[..., tril_indices[0], tril_indices[1]] = y[ + ..., self.relative_dim : self.relative_dim + tril_indices.shape[1] + ] + L[..., :, self.relative_dim :] = y[ + ..., self.relative_dim + tril_indices.shape[1] : + ].view(L[..., :, self.relative_dim :].shape) + + # Compute (L^T A_diag) L + A = torch.einsum( + "...ij,...jk->...ik", + torch.einsum("...ik,...kj->...ij", L.transpose(-1, -2), A_diag), + L, + ) + # A = torch.einsum( + # "...ij,...jk->...ik", + # torch.einsum("...ik,...kj->...ij", L, A_diag), + # L.transpose(-1, -2), + # ) + # assert (torch.linalg.eigvals(A).real >= -1e-6).all() + # ! This fails, but so far with just -e-10, so really really small... + value = self.sign * quadratify_xAx(z, A) + # with torch.no_grad(): + value += self.value_offset + + if return_all: + # return value, z, torch.diag(A_diag), L, A + return { + "value": value, + "z": z, + # "A_diag": torch.diag(A_diag), #! 
throws shape error + "L": L, + "A": A, + } + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # if self.minimize: + # loss_offset = (self.value_offset / target.min() - 1.0).pow(2) + # else: + # loss_offset = (self.value_offset / target.max() - 1.0).pow(2) + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset # + loss_scaling + + +class QR(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + action_dim, + latent_dim=None, + activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.action_dim = action_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + # Make Q estimator + num_lower_diag_elements_Q = sum(range(self.latent_dim + 1)) + self.lower_diag_NN_Q = create_MLP( + latent_dim, num_lower_diag_elements_Q, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_Q.apply(init_weights) + + # Make R estimator + num_lower_diag_elements_R = sum(range(action_dim + 1)) + self.lower_diag_NN_R = create_MLP( + latent_dim, num_lower_diag_elements_R, hidden_dims, activation, dropouts + ) + # self.lower_diag_NN_R.apply(init_weights) + + def forward(self, z, u, return_all=False): + Q_lower_diag = self.lower_diag_NN_Q(z) + L_Q = create_lower_diagonal(Q_lower_diag, self.latent_dim, self.device) + Q = compose_cholesky(L_Q) + + R_lower_diag = self.lower_diag_NN_R(z) + L_R = create_lower_diagonal(R_lower_diag, self.action_dim, self.device) + R = compose_cholesky(L_R) + + if return_all: + # return Q, L_Q, R, L_R + return {"Q": Q, "L_Q": L_Q, "R": R, "L_R": L_R} + else: + return Q, R + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, z, target, actions, **kwargs): + Q, R = self.forward(z, target) + Q_value = self.sign * quadratify_xAx(z, Q) + R_value = self.sign * quadratify_xAx(actions, R) + gae_loss = F.mse_loss(Q_value + R_value, target, reduction="mean") + return gae_loss + + def riccati_loss_fn(self, z, target, P, **kwargs): + Q, R = self.forward(z, target) + A, B = self.linearize_pendulum_dynamics() + A = A.to(self.device) + B = B.to(self.device) + AT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + A, + ) + AT_P_B = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", A.transpose(-1, -2), P), + B, + ) + R_BT_B = R + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), B) + BT_P_A = torch.einsum( + "...ij, ...ik -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", B.transpose(-1, -2), P), + A, + ) + final_term = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", AT_P_B, torch.linalg.inv(R_BT_B)), + BT_P_A, + ) + riccati_loss = Q + AT_P_A - P - final_term + return F.mse_loss(riccati_loss, target=0.0, reduction="mean") + + def linearize_pendulum_dynamics(self, x_desired=torch.tensor([0.0, 0.0])): + m = 1.0 + b = 0.1 + length = 1.0 + g = 9.81 + ml2 = m * length**2 + + A = torch.tensor( + [[0.0, 1.0], [g / length * 
torch.cos(x_desired[0]), -b / ml2]], + device=self.device, + ) + B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device) + return A, B + + +class QPNet(nn.Module): + def __init__( + self, + num_obs, + hidden_dims, + latent_dim=None, + activation="elu", + dropouts=None, + normalize_obs=False, + minimize=False, + device="cuda", + **kwargs, + ): + super().__init__() + self.device = device + if latent_dim is None: + latent_dim = num_obs + self.latent_dim = latent_dim + self.minimize = minimize + if minimize: + self.sign = torch.ones(1, device=device) + else: + self.sign = -torch.ones(1, device=device) + self.value_offset = nn.Parameter(torch.zeros(1, device=device)) + # V = x'Ax + b = a* x'Ax + b, and add a regularization for det(A) ~= 1 + self.scaling_quadratic = nn.Parameter(torch.ones(1, device=device)) + + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + + self.num_lower_diag_elements = sum(range(latent_dim + 1)) + self.lower_diag_NN = create_MLP( + num_obs, + self.num_lower_diag_elements + num_obs, + hidden_dims, + activation, + dropouts, + ) # ! num_obs added to output to create c for c.T@x + # self.lower_diag_NN.apply(init_weights) + + def forward(self, x, return_all=False): + res = self.lower_diag_NN(x) + output = res[..., : self.num_lower_diag_elements] + c = res[..., self.num_lower_diag_elements :] + L = create_lower_diagonal(output, self.latent_dim, self.device) + A = compose_cholesky(nn.ReLU()(L)) + value = self.sign * quadratify_xAx(x, A) + torch.einsum( + "...ij, ...ij -> ...i", c, x + ) + value += self.value_offset + # value *= self.scaling_quadratic + + if return_all: + # return value, A, L + return {"value": value, "A": A, "L": L, "c": c} + else: + return value + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, **kwargs): + loss_NN = F.mse_loss(self.forward(obs), target, reduction="mean") + + # loss_scaling = (self.scaling_quadratic - target.mean()).pow(2) + + return loss_NN # + loss_offset + loss_scaling + + +class NN_wRiccati(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." 
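+        # NN_wRiccati wraps a quadratic critic chosen by name at runtime (via the
+        # eval call below) and pairs it with a QR network: loss_fn() adds
+        # riccati_loss_fn(), evaluated with the critic's own A matrix as P and the
+        # linearised pendulum dynamics from QR.linearize_pendulum_dynamics(), as a
+        # regulariser on top of the plain value loss.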
+ critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + with torch.no_grad(): + P = self.critic(obs, return_all=True)["A"] + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.riccati_loss_fn(z, target, P=P) + + +class NN_wQR(nn.Module): + def __init__(self, **kwargs): + super().__init__() + # assert ( + # "critic_name" in kwargs.keys() + # ), "Name of custom critic not specified in NN with Riccati init" + # assert "action_dim" in kwargs.keys(), "Dimension of NN actions not specified." + critic_name = kwargs["critic_name"] + device = kwargs["device"] + self.has_latent = False if kwargs["latent_dim"] is None else True + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + self.QR_network = QR(**kwargs).to(device) + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = obs if self.has_latent is False else self.critic.latent_NN(obs) + return self.QR_network.loss_fn(z, target, actions=actions) + + +class NN_wLinearLatent(nn.Module): + def __init__(self, **kwargs): + super().__init__() + critic_name = kwargs["critic_name"] + device = kwargs["device"] # noqa + self.critic = eval(f"{critic_name}(**kwargs).to(device)") + + @property + def value_offset(self): + return self.critic.value_offset + + def forward(self, x, return_all=False): + return self.critic.forward(x, return_all) + + def evaluate(self, obs, return_all=False): + return self.forward(obs, return_all) + + def loss_fn(self, obs, target, actions, **kwargs): + return self.value_loss(obs, target, actions) + self.reg_loss( + obs, target, actions + ) + + def value_loss(self, obs, target, actions, **kwargs): + return self.critic.loss_fn(obs, target, actions=actions) + + def reg_loss(self, obs, target, actions, **kwargs): + z = self.critic.latent_NN(obs) + u = actions.detach().clone() + u = u.view(-1, u.shape[-1]) + K, z_offset = least_squares_fit(z, u) + return F.mse_loss(forward_affine(z, K, z_offset), target=u, reduction="mean") diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index f608cb29..9505377d 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -36,3 +36,4 @@ from .critic import Critic from .chimera_actor import ChimeraActor from .smooth_actor import SmoothActor +from .QRCritics import * diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 96c53438..f7960bfa 100644 --- a/learning/modules/critic.py +++ 
b/learning/modules/critic.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn + from .utils import create_MLP from .utils import RunningMeanStd @@ -29,5 +30,5 @@ def forward(self, x): def evaluate(self, critic_observations): return self.forward(critic_observations) - def loss_fn(self, obs, target): + def loss_fn(self, obs, target, **kwargs): return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index b18efe96..af782a53 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,32 +1,46 @@ import torch import os import copy -import numpy as np -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): - activation = get_activation(activation) - +def create_MLP( + num_inputs, num_outputs, hidden_dims, activations, dropouts=None, just_list=False +): + if not isinstance(activations, list): + activations = [activations] * len(hidden_dims) if dropouts is None: dropouts = [0] * len(hidden_dims) - + elif not isinstance(dropouts, list): + dropouts = [dropouts] * len(hidden_dims) layers = [] - if not hidden_dims: # handle no hidden layers + # first layer + if len(hidden_dims) > 0: + add_layer(layers, num_inputs, hidden_dims[0], activations[0], dropouts[0]) + for i in range(len(hidden_dims) - 1): + add_layer( + layers, + hidden_dims[i], + hidden_dims[i + 1], + activations[i + 1], + dropouts[i + 1], + ) + else: + add_layer(layers, hidden_dims[-1], num_outputs) + else: # handle no hidden dims, just linear layer add_layer(layers, num_inputs, num_outputs) + + if just_list: + return layers else: - add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) - for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) - else: - add_layer( - layers, - hidden_dims[i], - hidden_dims[i + 1], - activation, - dropouts[i + 1], - ) - return torch.nn.Sequential(*layers) + return torch.nn.Sequential(*layers) + + +def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): + layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) + if dropout > 0: + layer_list.append(torch.nn.Dropout(p=dropout)) + if activation is not None: + layer_list.append(get_activation(activation)) def get_activation(act_name): @@ -37,27 +51,29 @@ def get_activation(act_name): elif act_name == "relu": return torch.nn.ReLU() elif act_name == "crelu": - return torch.nn.ReLU() + return torch.nn.CELU() elif act_name == "lrelu": return torch.nn.LeakyReLU() elif act_name == "tanh": return torch.nn.Tanh() elif act_name == "sigmoid": return torch.nn.Sigmoid() + elif act_name == "softplus": + return torch.nn.Softplus() + elif act_name == "softmax": + return torch.nn.Softmax(dim=-1) + elif act_name == "randrelu": + return torch.nn.RReLU() + elif act_name == "softsign": + return torch.nn.Softsign() + elif act_name == "mish": + return torch.nn.Mish() else: print("invalid activation function!") return None -def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): - layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) - if dropout > 0: - layer_list.append(torch.nn.Dropout(p=dropout)) - if activation is not None: - layer_list.append(activation) - - -def export_network(network, network_name, path, num_inputs, latent=False): +def export_network(network, network_name, path, num_inputs): """ Thsi function traces and exports the given network module in .pt and .onnx file 
formats. These can be used for evaluation on other systems @@ -73,24 +89,11 @@ def export_network(network, network_name, path, num_inputs, latent=False): path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path model = copy.deepcopy(network).to("cpu") + model.device = "cpu" # force all tensors created as intermediate steps to CPU # To trace model, must be evaluated once with arbitrary input model.eval() dummy_input = torch.rand((num_inputs)) + # dummy_input = torch.rand((2, num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) - - if latent: - # Export latent model - path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) - model_latent.eval() - dummy_input = torch.rand((num_inputs)) - model_traced = torch.jit.trace(model_latent, dummy_input) - torch.onnx.export(model_traced, dummy_input, path_latent) - - # Save actor std of shape (num_actions, latent_dim) - # It is important that the shape is the same as the exploration matrix - path_std = os.path.join(path, network_name + "_std.txt") - std_transposed = model.get_std.numpy().T - np.savetxt(path_std, std_transposed) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 6819f4f4..01a1b825 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -36,6 +36,9 @@ def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + remove_zero_weighted_rewards( + train_cfg["critic"]["reward"]["termination_weight"] + ) self.actor_cfg = train_cfg["actor"] self.critic_cfg = train_cfg["critic"] diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index a0840b06..6735a86d 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -33,4 +33,6 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner from .old_policy_runner import OldPolicyRunner -from .off_policy_runner import OffPolicyRunner \ No newline at end of file +from .off_policy_runner import OffPolicyRunner +from .custom_critic_runner import CustomCriticRunner +from .datalogging_runner import DataLoggingRunner diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py new file mode 100644 index 00000000..fbfb8c05 --- /dev/null +++ b/learning/runners/custom_critic_runner.py @@ -0,0 +1,178 @@ +import os +import numpy as np +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor +from learning.utils import Logger + +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class CustomCriticRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + critic_class_name = 
self.critic_cfg["critic_class_name"] + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if "Cholesky" not in self.critic_cfg["critic_class_name"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def 
set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py new file mode 100644 index 00000000..7a227d78 --- /dev/null +++ b/learning/runners/datalogging_runner.py @@ -0,0 +1,213 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class DataLoggingRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + self.save() + # burn in observation normalization. 
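+        # runs the policy for a short rollout so the RunningMeanStd statistics settle, then resets the envs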
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + self.env.reset() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.save() + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + path_data = 
os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index d97dee8d..b5317e2e 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -4,6 +4,10 @@ from learning.utils import Logger from .on_policy_runner import OnPolicyRunner from learning.storage import DictStorage +from learning.algorithms import PPO2 # noqa F401 +from learning.modules.actor import Actor +from learning.modules.critic import Critic # noqa F401 +from learning.modules.QRCritics import * # noqa F401 logger = Logger() storage = DictStorage() @@ -19,14 +23,24 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_class_name = self.critic_cfg["critic_class_name"] + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() rewards_dict = {} self.alg.switch_to_train() - actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -35,10 +49,14 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) storage.initialize( @@ -48,6 +66,9 @@ def learn(self): device=self.device, ) + # burn in observation normalization. 
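+        # the burn-in rollout also initializes the critic's value_offset from its mean prediction (see burn_in_normalization below)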
+ self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -55,7 +76,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -73,9 +94,9 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.actor_cfg["actor_obs"], self.actor_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -86,8 +107,11 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) @@ -125,3 +149,20 @@ def set_up_logger(self): logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + rewards = self.alg.critic.evaluate(critic_obs) + self.alg.critic.value_offset.copy_(rewards.mean()) + print(f"Value offset: {self.alg.critic.value_offset.item()}") + self.env.reset() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index f601ce71..78b21225 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -42,7 +42,9 @@ def learn(self, states_to_log_dict=None): "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) storage.initialize( @@ -106,6 +108,7 @@ def learn(self, states_to_log_dict=None): "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 99534c22..73039526 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -1,21 +1,29 @@ +import numpy as np import torch from tensordict import TensorDict @torch.no_grad def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking if critic is None: last_values = torch.zeros_like(data["rewards"][0]) else: last_values = critic.evaluate(data["critic_obs"][-1]) returns = torch.zeros_like(data["rewards"]) - returns[-1] = last_values * ~data["dones"][-1] + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done - - return normalize(returns) + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * 
data["timed_out"][k] + * ~data["terminated"][k] + ) + return returns @torch.no_grad @@ -29,7 +37,6 @@ def compute_generalized_advantages(data, gamma, lam, critic): advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this - # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] advantages[-1] = ( data["rewards"][-1] @@ -53,22 +60,16 @@ def compute_generalized_advantages(data, gamma, lam, critic): # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator( - data, batch_size, num_epochs=1, max_gradient_steps=None, keys=None -): +def create_uniform_generator(data, batch_size, max_gradient_steps=100): n, m = data.shape total_data = n * m if batch_size > total_data: - Warning("Batch size is larger than total data, using available data only.") batch_size = total_data num_batches_per_epoch = total_data // batch_size - if max_gradient_steps: - if max_gradient_steps < num_batches_per_epoch: - num_batches_per_epoch = max_gradient_steps - num_epochs = max_gradient_steps // num_batches_per_epoch - num_epochs = max(num_epochs, 1) + num_epochs = max_gradient_steps // num_batches_per_epoch + last_epoch_batches = max_gradient_steps - num_epochs * num_batches_per_epoch for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) @@ -77,3 +78,19 @@ def create_uniform_generator( indices[i * batch_size : (i + 1) * batch_size] ] yield batched_data + else: + indices = torch.randperm(total_data, device=data.device) + for i in range(last_epoch_batches): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. 
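+    # each TensorDict entry is written as a separate named array in one compressed .npz archive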
+ if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return diff --git a/requirements.txt b/requirements.txt index 3f100110..923df0e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ ruff pre-commit onnx onnxruntime -tensordict \ No newline at end of file +tensordict +optuna \ No newline at end of file From 1540cd2d5db745052cdafbf1d2cdc61b975fd6d5 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 28 Aug 2024 11:33:10 -0400 Subject: [PATCH 78/98] fix faster training configs for benchmarking --- .../mini_cheetah/mini_cheetah_osc_config.py | 4 +-- .../mini_cheetah/mini_cheetah_ref_config.py | 32 +++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index b16bf919..e244f54d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,7 +166,7 @@ class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = ["elu", "elu", "tanh"] smooth_exploration = False obs = [ @@ -186,7 +186,7 @@ class noise(MiniCheetahRunnerCfg.actor.noise): class critic(MiniCheetahRunnerCfg.critic): hidden_dims = [128, 128] - activation = "relu" + activation = ["relu", "tanh"] normalize_obs = False critic_class_name = "Critic" # "DenseSpectralLatent" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index e9af92ac..69b4ee60 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -25,9 +25,9 @@ class control(MiniCheetahCfg.control): # * PD Drive parameters: stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} - gait_freq = 3.0 + gait_freq = 2.5 ctrl_frequency = 100 - desired_sim_frequency = 500 + desired_sim_frequency = 300 class commands(MiniCheetahCfg.commands): pass @@ -132,7 +132,7 @@ class weights: dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 + reference_traj = 0.0 swing_grf = 1.5 stance_grf = 1.5 @@ -140,11 +140,31 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - pass + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 36 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 1000 # number of policy updates + max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 # deprecate + num_steps_per_env = 20 # deprecate From 842b39e9647a6b9455587cc77a230964d17c55b6 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 29 Aug 2024 11:57:35 -0400 Subject: [PATCH 79/98] just tweaks --- .../mini_cheetah/mini_cheetah_ref_config.py | 17 +++++++++++++---- learning/runners/my_runner.py | 9 +++------ scripts/train.py | 1 + 3 files changed, 17 insertions(+), 10 
deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 69b4ee60..5d2a1939 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,13 +67,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "MyRunner" class actor(MiniCheetahRunnerCfg.actor): hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid smooth_exploration = False - exploration_sample_freq = 16 + # exploration_sample_freq = 16 activation = ["elu", "elu", "tanh"] obs = [ "base_ang_vel", @@ -82,6 +82,7 @@ class actor(MiniCheetahRunnerCfg.actor): "dof_pos_obs", "dof_vel", "phase_obs", + "dof_pos_target", ] normalize_obs = False @@ -114,6 +115,14 @@ class critic: "dof_pos_target", ] normalize_obs = False + # critic_class_name = "Critic" # "DenseSpectralLatent" + critic_class_name = "DenseSpectralLatent" + # * some class-specific params + minimize = False + relative_dim = 4 # 16 + latent_dim = 12 # 18, + latent_hidden_dims = [256, 256] + latent_activation = "elu" class reward: class weights: @@ -142,7 +151,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # both gamma = 0.99 - lam = 0.95 + lam = 0.75 # shared batch_size = 2**15 max_gradient_steps = 36 @@ -151,7 +160,7 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): batch_size = 2**15 # new clip_param = 0.2 - learning_rate = 1.0e-3 + learning_rate = 1.0e-2 max_grad_norm = 1.0 # Critic use_clipped_value_loss = True diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index b5317e2e..17e0817b 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -144,16 +144,16 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @torch.no_grad def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_obs(self.actor_cfg["obs"]) - critic_obs = self.get_obs(self.critic_cfg["obs"]) for _ in range(n_iterations): actions = self.alg.act(actor_obs) self.set_actions(self.actor_cfg["actions"], actions) @@ -161,8 +161,5 @@ def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.critic_cfg["obs"]) - rewards = self.alg.critic.evaluate(critic_obs) - self.alg.critic.value_offset.copy_(rewards.mean()) print(f"Value offset: {self.alg.critic.value_offset.item()}") self.env.reset() diff --git a/scripts/train.py b/scripts/train.py index 759992cf..a2535408 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -31,3 +31,4 @@ def train(train_cfg, policy_runner): if __name__ == "__main__": train_cfg, policy_runner = setup() train(train_cfg=train_cfg, policy_runner=policy_runner) + print(f"Final value offset: {policy_runner.alg.critic.value_offset.item()}") From 33efaf5eda267147556449e970f7c1319ef3923f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 29 Aug 2024 13:07:29 -0400 Subject: [PATCH 80/98] SAC: typo and params that work for pendulum --- gym/envs/pendulum/pendulum_SAC_config.py | 
10 +++++----- learning/modules/chimera_actor.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index d555d5df..b094ee91 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -5,7 +5,7 @@ class PendulumSACCfg(PendulumCfg): class env(PendulumCfg.env): - num_envs = 1 + num_envs = 1024 episode_length_s = 10 class init_state(PendulumCfg.init_state): @@ -76,7 +76,7 @@ class reward: class weights: theta = 0.0 omega = 0.0 - equilibrium = 1.0 + equilibrium = 2.0 energy = 0.5 dof_vel = 0.0 torques = 0.025 @@ -85,8 +85,8 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 500 - storage_size = 10**6 # 17 + initial_fill = 0 + storage_size = 100 * 1024 # 10**6 # 17 batch_size = 256 # 4096 max_gradient_steps = 1 # 10 # SB3: 1 action_max = 2.0 @@ -106,7 +106,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" - max_iterations = 40_000 # number of policy updates + max_iterations = 30_000 # number of policy updates algorithm_class_name = "SAC" save_interval = 5000 num_steps_per_env = 1 diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index 1addc83d..a9ad77d6 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -36,19 +36,19 @@ def __init__( num_inputs=num_obs, num_outputs=hidden_dims["latent"][-1], hidden_dims=hidden_dims["latent"][:-1], - activation=activation["latent"], + activations=activation["latent"], ) self.mean_NN = create_MLP( num_inputs=hidden_dims["latent"][-1], num_outputs=num_actions, hidden_dims=hidden_dims["mean"], - activation=activation["mean"], + activations=activation["mean"], ) self.std_NN = create_MLP( num_inputs=hidden_dims["latent"][-1], num_outputs=num_actions, hidden_dims=hidden_dims["std"], - activation=activation["std"], + activations=activation["std"], ) # maybe zap From fd065559b0c3fa92a9bb71faf47d8db2950b3b64 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 30 Aug 2024 11:20:34 -0400 Subject: [PATCH 81/98] update ruff settings --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a1906be6..131f359f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,8 +40,6 @@ target-version = "py38" # McCabe complexity (`C901`) by default. ignore = ["E722"] -ignore-init-module-imports = true - # Allow unused variables when underscore-prefixed. 
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" From 3550dfd9b4b0a58b00d44587b655c6be7d1b2853 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 30 Aug 2024 11:22:10 -0400 Subject: [PATCH 82/98] address deprecation in torch.load and add lr to trackables for SAC --- learning/algorithms/sac.py | 16 +++++ learning/runners/__init__.py | 1 + learning/runners/datalogging_runner.py | 2 +- learning/runners/off_policy_runner.py | 6 +- learning/runners/old_policy_runner.py | 2 +- learning/runners/on_policy_runner.py | 2 +- learning/runners/psd_sac_runner.py | 58 +++++++++++++++++++ .../test_runner_integration.py | 2 +- 8 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 learning/runners/psd_sac_runner.py diff --git a/learning/algorithms/sac.py b/learning/algorithms/sac.py index e0651d2d..7bda74cb 100644 --- a/learning/algorithms/sac.py +++ b/learning/algorithms/sac.py @@ -74,6 +74,22 @@ def __init__( def alpha(self): return self.log_alpha.exp() + @property + def alpha_lr(self): + return self.log_alpha_optimizer.param_groups[0]["lr"] + + @property + def critic_1_lr(self): + return self.critic_1_optimizer.param_groups[0]["lr"] + + @property + def critic_2_lr(self): + return self.critic_2_optimizer.param_groups[0]["lr"] + + @property + def actor_lr(self): + return self.actor_optimizer.param_groups[0]["lr"] + def switch_to_train(self): self.actor.train() self.critic_1.train() diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index 6735a86d..657b2f25 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -36,3 +36,4 @@ from .off_policy_runner import OffPolicyRunner from .custom_critic_runner import CustomCriticRunner from .datalogging_runner import DataLoggingRunner +from .psd_sac_runner import PSACRunner \ No newline at end of file diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py index 7a227d78..2f853fae 100644 --- a/learning/runners/datalogging_runner.py +++ b/learning/runners/datalogging_runner.py @@ -191,7 +191,7 @@ def save(self): export_to_numpy(storage.data, path_data + ".npz") def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index e57be9a9..7aed58e7 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -221,6 +221,10 @@ def set_up_logger(self): "mean_actor_loss", "mean_alpha_loss", "alpha", + "alpha_lr", + "critic_1_lr", + "critic_2_lr", + "actor_lr", ], ) # logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) @@ -246,7 +250,7 @@ def save(self): torch.save(save_dict, path) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic_1.load_state_dict(loaded_dict["critic_1_state_dict"]) self.alg.critic_2.load_state_dict(loaded_dict["critic_2_state_dict"]) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..d6fe5ddf 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -134,7 +134,7 @@ def save(self): ) def load(self, path, load_optimizer=True): - 
loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 78b21225..0a81680d 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -193,7 +193,7 @@ def save(self): ) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: diff --git a/learning/runners/psd_sac_runner.py b/learning/runners/psd_sac_runner.py new file mode 100644 index 00000000..0ba89104 --- /dev/null +++ b/learning/runners/psd_sac_runner.py @@ -0,0 +1,58 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from learning.runners import OffPolicyRunner +from learning.modules import Critic, ChimeraActor +from learning.modules.QRCritics import * # noqa F401 + +from learning.storage import ReplayBuffer +from learning.algorithms import SAC + + +logger = Logger() +storage = ReplayBuffer() + + +class PSACRunner(OffPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_in = self.get_obs_size(self.critic_cfg["obs"]) + num_actions # noqa: F841 + actor = ChimeraActor(num_actor_obs, num_actions, **self.actor_cfg) + + critic_class = self.critic_cfg["critic_class_name"] + critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_1 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + target_critic_2 = eval(f"{critic_class}(num_critic_in, **self.critic_cfg)") + + # critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + # target_critic_1 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + # target_critic_2 = Critic(num_critic_obs + num_actions, **self.critic_cfg) + + print(actor) + print(critic_1) + + self.alg = SAC( + actor, + critic_1, + critic_2, + target_critic_1, + target_critic_2, + device=self.device, + **self.alg_cfg, + ) diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 3310624f..ce645b5c 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -35,7 +35,7 @@ def load_saved_policy(runner): load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) - loaded_dict = torch.load(resume_path) + loaded_dict = torch.load(resume_path, weights_only=True) actor.load_state_dict(loaded_dict["actor_state_dict"]) actor.eval() return actor From 4247ea7bf94d3c3cfe551020e3cc9cd27f16a490 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 30 Aug 2024 11:24:13 -0400 Subject: [PATCH 83/98] allow randomize_episode_counters for reset_to_uniform in pendulum --- gym/envs/pendulum/pendulum.py | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index a3bbc355..90bc60ae 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -39,8 +39,8 @@ def reset_to_uniform(self, env_ids): device=self.device, ) grid = torch.cartesian_prod(lin_pos, lin_vel) - self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) - self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) + self.dof_pos[env_ids] = grid[env_ids, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[env_ids, 1].unsqueeze(-1) def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling @@ -54,8 +54,8 @@ def _reward_equilibrium(self): # theta_norm = self._normalize_theta() # omega = self.dof_vel[:, 0] # error = torch.stack( - # [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 - # ) + # [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], + # dim=1) # return self._sqrdexp(torch.mean(error, dim=1), 0.01) # todo compare alternatives From 66c9e1b3b71c7067a737b88326ff249240772f8e Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 3 Sep 2024 09:35:45 -0400 Subject: [PATCH 84/98] some slimmin and separation of utils --- gym/envs/__init__.py | 3 + gym/envs/pendulum/pendulum_PSD_config.py | 121 +++++++++++++++++++++++ learning/modules/QRCritics.py | 60 ++--------- learning/modules/__init__.py | 1 + learning/modules/utils/__init__.py | 1 + learning/modules/utils/matfncs.py | 53 ++++++++++ 6 files changed, 187 insertions(+), 52 deletions(-) create mode 100644 gym/envs/pendulum/pendulum_PSD_config.py create mode 100644 learning/modules/utils/matfncs.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 1b78a261..0aa7ce31 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -34,6 +34,7 @@ "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", "PendulumSACCfg": ".pendulum.pendulum_SAC_config", + "PendulumPSDCfg": ".pendulum.pendulum_PSD_config", } runner_config_dict = { @@ -48,6 +49,7 @@ "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", + "PendulumPSDRunnerCfg": ".pendulum.pendulum_PSD_config", } task_dict = { @@ -77,6 +79,7 @@ "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], + "psd_pendulum": ["Pendulum", "PendulumPSDCfg", "PendulumPSDRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/pendulum_PSD_config.py b/gym/envs/pendulum/pendulum_PSD_config.py new file mode 100644 index 00000000..3de1ccca --- /dev/null +++ b/gym/envs/pendulum/pendulum_PSD_config.py @@ -0,0 +1,121 @@ +import torch +from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO +from gym.envs.pendulum.pendulum_config import PendulumCfg + + +class PendulumPSDCfg(PendulumCfg): + class env(PendulumCfg.env): + num_envs = 1024 + episode_length_s = 10 + + class init_state(PendulumCfg.init_state): + reset_mode = "reset_to_uniform" + default_joint_angles = {"theta": 0.0} + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(PendulumCfg.control): + ctrl_frequency = 10 + desired_sim_frequency = 100 + + class asset(PendulumCfg.asset): + joint_damping = 0.1 + + class 
reward_settings(PendulumCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(PendulumCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + tau_ff = 1.0 + toruqes = 2.5 # 5.0 + + +class PendulumPSDRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "PSACRunner" + + class actor: + hidden_dims = { + "latent": [128, 64], + "mean": [32], + "std": [32], + } + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = { + "latent": "elu", + "mean": "elu", + "std": "elu", + } + + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + obs = ["dof_pos_obs", "dof_vel"] + hidden_dims = [16, 16] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + # TODO[lm]: Current normalization uses torch.no_grad, this should be changed + normalize_obs = False + + # critic_class_name = "CholeskyLatent" + critic_class_name = "DenseSpectralLatent" + # * some class-specific params + minimize = False + relative_dim = 4 # 16 + latent_dim = 4 # 18, + dropouts = [0.1, 0.0] + latent_hidden_dims = [16] + latent_activation = "elu" + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.5 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + initial_fill = 0 + storage_size = 100 * 1024 # steps_per_episode * num_envs + batch_size = 1024 # 4096 + max_gradient_steps = 10 # 10 # SB3: 1 + action_max = 2.0 + action_min = -2.0 + actor_noise_std = 1.0 + log_std_max = 4.0 + log_std_min = -20.0 + alpha = 0.2 + target_entropy = -1.0 + max_grad_norm = 1.0 + polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) + gamma = 0.99 + alpha_lr = 1e-4 + actor_lr = 1e-3 + critic_lr = 5e-4 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "sac_pendulum" + max_iterations = 10_000 # number of policy updates + algorithm_class_name = "SAC" + num_steps_per_env = 1 + save_interval = 2000 + log_storage = True diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py index a65ec304..3933f4e3 100644 --- a/learning/modules/QRCritics.py +++ b/learning/modules/QRCritics.py @@ -3,36 +3,14 @@ import torch.nn.functional as F from learning.modules.utils import RunningMeanStd, create_MLP - - -def create_lower_diagonal(x, n, device): - L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) - tril_indices = torch.tril_indices(row=n, col=n, offset=0) - rows, cols = tril_indices - L[..., rows, cols] = x - return L - - -def create_PD_lower_diagonal(x, n, device): - tril_indices = torch.tril_indices(row=n, col=n, offset=0) - diag_indices = (tril_indices[0] == tril_indices[1]).nonzero(as_tuple=True)[0] - x[..., diag_indices] = F.softplus(x[..., diag_indices]) - L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False) - rows, cols = tril_indices - L[..., rows, cols] = x - return L - - -def compose_cholesky(L): - return torch.einsum("...ij, ...jk -> ...ik", L, L.transpose(-2, -1)) - - -def quadratify_xAx(x, A): - return torch.einsum( - "...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), - x.unsqueeze(-1), - ).squeeze() +from learning.modules.utils import ( + quadratify_xAx, + compose_cholesky, + create_lower_diagonal, + least_squares_fit, + forward_affine, + create_PD_lower_diagonal, +) def init_weights(m): @@ 
-41,28 +19,6 @@ def init_weights(m):
         m.bias.data.fill_(0.01)


-def least_squares_fit(x, y):
-    x_flat = x.view(-1, x.shape[-1])
-    y_flat = y.view(-1, y.shape[-1])
-    ones = torch.ones(x_flat.shape[0], 1, device=x.device)
-    x_aug = torch.cat([ones, x_flat], dim=1)
-    # Use torch.linalg.lstsq to find the least-squares solution
-    result = torch.linalg.lstsq(x_aug, y_flat)
-    coefficients = result.solution
-    bias = coefficients[0]
-    weights = coefficients[1:]
-
-    return weights, bias
-
-
-def forward_affine(x, weights, bias):
-    x_flat = x.view(-1, x.shape[-1])
-    x_aug = torch.cat(
-        [torch.ones(x_flat.shape[0], 1, device=x_flat.device), x_flat], dim=1
-    )
-    return x_aug.matmul(torch.cat([bias.unsqueeze(0), weights], dim=0))
-
-
 class Critic(nn.Module):
     def __init__(
         self,
diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py
index 9505377d..8281ab8d 100644
--- a/learning/modules/__init__.py
+++ b/learning/modules/__init__.py
@@ -37,3 +37,4 @@
 from .chimera_actor import ChimeraActor
 from .smooth_actor import SmoothActor
 from .QRCritics import *
+from .TaylorCritics import *
\ No newline at end of file
diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py
index 1422b82d..35868ca0 100644
--- a/learning/modules/utils/__init__.py
+++ b/learning/modules/utils/__init__.py
@@ -1,2 +1,3 @@
 from .neural_net import create_MLP, export_network
 from .normalize import RunningMeanStd
+from .matfncs import create_lower_diagonal, create_PD_lower_diagonal, compose_cholesky, quadratify_xAx, least_squares_fit, forward_affine
\ No newline at end of file
diff --git a/learning/modules/utils/matfncs.py b/learning/modules/utils/matfncs.py
new file mode 100644
index 00000000..92ebd0e7
--- /dev/null
+++ b/learning/modules/utils/matfncs.py
@@ -0,0 +1,54 @@
+import torch
+import torch.nn.functional as F  # needed for the softplus in create_PD_lower_diagonal
+
+
+def create_lower_diagonal(x, n, device):
+    L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False)
+    tril_indices = torch.tril_indices(row=n, col=n, offset=0)
+    rows, cols = tril_indices
+    L[..., rows, cols] = x
+    return L
+
+
+def create_PD_lower_diagonal(x, n, device):
+    tril_indices = torch.tril_indices(row=n, col=n, offset=0)
+    diag_indices = (tril_indices[0] == tril_indices[1]).nonzero(as_tuple=True)[0]
+    x[..., diag_indices] = F.softplus(x[..., diag_indices])
+    L = torch.zeros((*x.shape[:-1], n, n), device=device, requires_grad=False)
+    rows, cols = tril_indices
+    L[..., rows, cols] = x
+    return L
+
+
+def compose_cholesky(L):
+    return torch.einsum("...ij, ...jk -> ...ik", L, L.transpose(-2, -1))
+
+
+def quadratify_xAx(x, A):
+    return torch.einsum(
+        "...ij, ...jk -> ...ik",
+        torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A),
+        x.unsqueeze(-1),
+    ).squeeze()
+
+
+def least_squares_fit(x, y):
+    x_flat = x.view(-1, x.shape[-1])
+    y_flat = y.view(-1, y.shape[-1])
+    ones = torch.ones(x_flat.shape[0], 1, device=x.device)
+    x_aug = torch.cat([ones, x_flat], dim=1)
+    # Use torch.linalg.lstsq to find the least-squares solution
+    result = torch.linalg.lstsq(x_aug, y_flat)
+    coefficients = result.solution
+    bias = coefficients[0]
+    weights = coefficients[1:]
+
+    return weights, bias
+
+
+def forward_affine(x, weights, bias):
+    x_flat = x.view(-1, x.shape[-1])
+    x_aug = torch.cat(
+        [torch.ones(x_flat.shape[0], 1, device=x_flat.device), x_flat], dim=1
+    )
+    return x_aug.matmul(torch.cat([bias.unsqueeze(0), weights], dim=0))
From dbd48c9b5fc1555ffb88e0a8bd09488eeef8f38e Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Tue, 3 Sep 2024 15:05:50 -0400
Subject: [PATCH 
85/98] add layer norm to create_MLP, +kwargs flexibility --- learning/modules/actor.py | 2 +- learning/modules/critic.py | 2 +- learning/modules/utils/neural_net.py | 32 ++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 14104df7..0f112dba 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -27,7 +27,7 @@ def __init__( self.num_actions = num_actions self.hidden_dims = hidden_dims self.activation = activation - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation, **kwargs) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index f7960bfa..5dfad32d 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -16,7 +16,7 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index af782a53..27b47a54 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -4,18 +4,33 @@ def create_MLP( - num_inputs, num_outputs, hidden_dims, activations, dropouts=None, just_list=False + num_inputs, + num_outputs, + hidden_dims, + activations=None, + dropouts=0.0, + layer_norm=False, + just_list=False, + **kwargs, ): if not isinstance(activations, list): activations = [activations] * len(hidden_dims) - if dropouts is None: - dropouts = [0] * len(hidden_dims) - elif not isinstance(dropouts, list): + if not isinstance(dropouts, list): dropouts = [dropouts] * len(hidden_dims) + if not isinstance(layer_norm, list): + layer_norm = [layer_norm] * len(hidden_dims) + layers = [] # first layer if len(hidden_dims) > 0: - add_layer(layers, num_inputs, hidden_dims[0], activations[0], dropouts[0]) + add_layer( + layers, + num_inputs, + hidden_dims[0], + activations[0], + dropouts[0], + layer_norm[0], + ) for i in range(len(hidden_dims) - 1): add_layer( layers, @@ -23,6 +38,7 @@ def create_MLP( hidden_dims[i + 1], activations[i + 1], dropouts[i + 1], + layer_norm=layer_norm[i + 1], ) else: add_layer(layers, hidden_dims[-1], num_outputs) @@ -35,8 +51,12 @@ def create_MLP( return torch.nn.Sequential(*layers) -def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): +def add_layer( + layer_list, num_inputs, num_outputs, activation=None, dropout=0, layer_norm=False +): layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) + if layer_norm: + layer_list.append(torch.nn.LayerNorm(num_outputs)) if dropout > 0: layer_list.append(torch.nn.Dropout(p=dropout)) if activation is not None: From 7d970816aef13ea0c32af05e6e64f3687940d2bc Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 3 Sep 2024 15:07:46 -0400 Subject: [PATCH 86/98] update for layernorm --- gym/envs/pendulum/pendulum_PSD_config.py | 34 +++++++++++------------- learning/modules/QRCritics.py | 24 ++++++++--------- learning/modules/chimera_actor.py | 20 +++++++------- learning/modules/state_estimator.py | 2 +- 4 files changed, 38 insertions(+), 42 deletions(-) diff --git a/gym/envs/pendulum/pendulum_PSD_config.py b/gym/envs/pendulum/pendulum_PSD_config.py index 3de1ccca..056070a4 100644 --- 
a/gym/envs/pendulum/pendulum_PSD_config.py +++ b/gym/envs/pendulum/pendulum_PSD_config.py @@ -27,10 +27,10 @@ class reward_settings(PendulumCfg.reward_settings): tracking_sigma = 0.25 class scaling(PendulumCfg.scaling): - dof_vel = 5.0 - dof_pos = 2.0 * torch.pi + dof_vel = 25.0 + dof_pos = 10.0 * torch.pi tau_ff = 1.0 - toruqes = 2.5 # 5.0 + torques = 2.5 # 5.0 class PendulumPSDRunnerCfg(FixedRobotCfgPPO): @@ -38,17 +38,14 @@ class PendulumPSDRunnerCfg(FixedRobotCfgPPO): runner_class_name = "PSACRunner" class actor: - hidden_dims = { - "latent": [128, 64], - "mean": [32], - "std": [32], - } - # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = { - "latent": "elu", - "mean": "elu", - "std": "elu", - } + latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} + mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} + # hidden_dims = {"latent": [128, 64], "mean": [32], "std": [32]} + # # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + # activation = {"latent": "elu", "mean": "elu", "std": "elu"} + # layer_norm = {"latent": True, "mean": True, "std": True} normalize_obs = False obs = [ @@ -66,17 +63,18 @@ class critic: obs = ["dof_pos_obs", "dof_vel"] hidden_dims = [16, 16] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + dropouts = [0.1, 0.0] + layer_norm = [True, True] activation = "elu" # TODO[lm]: Current normalization uses torch.no_grad, this should be changed normalize_obs = False # critic_class_name = "CholeskyLatent" - critic_class_name = "DenseSpectralLatent" + critic_class_name = "Critic" # * some class-specific params minimize = False relative_dim = 4 # 16 latent_dim = 4 # 18, - dropouts = [0.1, 0.0] latent_hidden_dims = [16] latent_activation = "elu" @@ -114,8 +112,8 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "sac_pendulum" - max_iterations = 10_000 # number of policy updates + max_iterations = 5_000 # number of policy updates algorithm_class_name = "SAC" num_steps_per_env = 1 - save_interval = 2000 + save_interval = 500 log_storage = True diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py index 3933f4e3..d2ed9378 100644 --- a/learning/modules/QRCritics.py +++ b/learning/modules/QRCritics.py @@ -31,7 +31,7 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) @@ -61,7 +61,7 @@ def __init__( hidden_dims, latent_dim=None, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, device="cuda", @@ -118,7 +118,7 @@ def __init__( hidden_dims, latent_dim=None, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, latent_hidden_dims=[128, 128], @@ -163,7 +163,7 @@ def __init__( hidden_dims, latent_dim=None, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, device="cuda", @@ -223,9 +223,9 @@ def __init__( latent_dim, latent_hidden_dims=[64], latent_activation="tanh", - latent_dropouts=None, + latent_dropouts=0.0, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, device="cuda", @@ -314,13 +314,13 @@ def __init__( num_obs, 
hidden_dims, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, relative_dim=None, latent_dim=None, latent_hidden_dims=[64], latent_activation="tanh", - latent_dropouts=None, + latent_dropouts=0.0, minimize=False, device="cuda", **kwargs, @@ -414,13 +414,13 @@ def __init__( num_obs, hidden_dims, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, relative_dim=None, latent_dim=None, latent_hidden_dims=[64], latent_activation="tanh", - latent_dropouts=None, + latent_dropouts=0.0, minimize=False, device="cuda", **kwargs, @@ -531,7 +531,7 @@ def __init__( action_dim, latent_dim=None, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, device="cuda", @@ -643,7 +643,7 @@ def __init__( hidden_dims, latent_dim=None, activation="elu", - dropouts=None, + dropouts=0.0, normalize_obs=False, minimize=False, device="cuda", diff --git a/learning/modules/chimera_actor.py b/learning/modules/chimera_actor.py index a9ad77d6..1d5ead41 100644 --- a/learning/modules/chimera_actor.py +++ b/learning/modules/chimera_actor.py @@ -11,8 +11,9 @@ def __init__( self, num_obs, num_actions, - hidden_dims, - activation, + # hidden_dims, + # activation, + nn_params, std_init=1.0, log_std_max=4.0, log_std_min=-20.0, @@ -34,21 +35,18 @@ def __init__( self.latent_NN = create_MLP( num_inputs=num_obs, - num_outputs=hidden_dims["latent"][-1], - hidden_dims=hidden_dims["latent"][:-1], - activations=activation["latent"], + num_outputs=nn_params["latent"]["hidden_dims"][-1], + **nn_params["latent"], ) self.mean_NN = create_MLP( - num_inputs=hidden_dims["latent"][-1], + num_inputs=nn_params["latent"]["hidden_dims"][-1], num_outputs=num_actions, - hidden_dims=hidden_dims["mean"], - activations=activation["mean"], + **nn_params["mean"], ) self.std_NN = create_MLP( - num_inputs=hidden_dims["latent"][-1], + num_inputs=nn_params["latent"]["hidden_dims"][-1], num_outputs=num_actions, - hidden_dims=hidden_dims["std"], - activations=activation["std"], + **nn_params["std"], ) # maybe zap diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py index f8cfe221..dfba137d 100644 --- a/learning/modules/state_estimator.py +++ b/learning/modules/state_estimator.py @@ -20,7 +20,7 @@ def __init__( num_outputs, hidden_dims=[256, 128], activation="elu", - dropouts=None, + dropouts=0.0, **kwargs, ): if kwargs: From 032ac8ebc66c4ddc3f4a9379a91b3731fc204ead Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 3 Sep 2024 15:18:47 -0400 Subject: [PATCH 87/98] add layer norm to create_MLP, +kwargs flexibility --- learning/modules/actor.py | 2 +- learning/modules/critic.py | 2 +- learning/modules/utils/neural_net.py | 113 ++++++++++++++++----------- 3 files changed, 70 insertions(+), 47 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 14104df7..0f112dba 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -27,7 +27,7 @@ def __init__( self.num_actions = num_actions self.hidden_dims = hidden_dims self.activation = activation - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation, **kwargs) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 96c53438..edca8875 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -15,7 +15,7 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 
1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, **kwargs) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index b18efe96..27b47a54 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,32 +1,66 @@ import torch import os import copy -import numpy as np -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): - activation = get_activation(activation) - - if dropouts is None: - dropouts = [0] * len(hidden_dims) +def create_MLP( + num_inputs, + num_outputs, + hidden_dims, + activations=None, + dropouts=0.0, + layer_norm=False, + just_list=False, + **kwargs, +): + if not isinstance(activations, list): + activations = [activations] * len(hidden_dims) + if not isinstance(dropouts, list): + dropouts = [dropouts] * len(hidden_dims) + if not isinstance(layer_norm, list): + layer_norm = [layer_norm] * len(hidden_dims) layers = [] - if not hidden_dims: # handle no hidden layers + # first layer + if len(hidden_dims) > 0: + add_layer( + layers, + num_inputs, + hidden_dims[0], + activations[0], + dropouts[0], + layer_norm[0], + ) + for i in range(len(hidden_dims) - 1): + add_layer( + layers, + hidden_dims[i], + hidden_dims[i + 1], + activations[i + 1], + dropouts[i + 1], + layer_norm=layer_norm[i + 1], + ) + else: + add_layer(layers, hidden_dims[-1], num_outputs) + else: # handle no hidden dims, just linear layer add_layer(layers, num_inputs, num_outputs) + + if just_list: + return layers else: - add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) - for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) - else: - add_layer( - layers, - hidden_dims[i], - hidden_dims[i + 1], - activation, - dropouts[i + 1], - ) - return torch.nn.Sequential(*layers) + return torch.nn.Sequential(*layers) + + +def add_layer( + layer_list, num_inputs, num_outputs, activation=None, dropout=0, layer_norm=False +): + layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) + if layer_norm: + layer_list.append(torch.nn.LayerNorm(num_outputs)) + if dropout > 0: + layer_list.append(torch.nn.Dropout(p=dropout)) + if activation is not None: + layer_list.append(get_activation(activation)) def get_activation(act_name): @@ -37,27 +71,29 @@ def get_activation(act_name): elif act_name == "relu": return torch.nn.ReLU() elif act_name == "crelu": - return torch.nn.ReLU() + return torch.nn.CELU() elif act_name == "lrelu": return torch.nn.LeakyReLU() elif act_name == "tanh": return torch.nn.Tanh() elif act_name == "sigmoid": return torch.nn.Sigmoid() + elif act_name == "softplus": + return torch.nn.Softplus() + elif act_name == "softmax": + return torch.nn.Softmax(dim=-1) + elif act_name == "randrelu": + return torch.nn.RReLU() + elif act_name == "softsign": + return torch.nn.Softsign() + elif act_name == "mish": + return torch.nn.Mish() else: print("invalid activation function!") return None -def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): - layer_list.append(torch.nn.Linear(num_inputs, num_outputs)) - if dropout > 0: - layer_list.append(torch.nn.Dropout(p=dropout)) - if activation is not None: - layer_list.append(activation) - - -def export_network(network, network_name, path, num_inputs, latent=False): +def export_network(network, network_name, path, num_inputs): """ Thsi function 
traces and exports the given network module in .pt and .onnx file formats. These can be used for evaluation on other systems @@ -73,24 +109,11 @@ def export_network(network, network_name, path, num_inputs, latent=False): path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path model = copy.deepcopy(network).to("cpu") + model.device = "cpu" # force all tensors created as intermediate steps to CPU # To trace model, must be evaluated once with arbitrary input model.eval() dummy_input = torch.rand((num_inputs)) + # dummy_input = torch.rand((2, num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) - - if latent: - # Export latent model - path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) - model_latent.eval() - dummy_input = torch.rand((num_inputs)) - model_traced = torch.jit.trace(model_latent, dummy_input) - torch.onnx.export(model_traced, dummy_input, path_latent) - - # Save actor std of shape (num_actions, latent_dim) - # It is important that the shape is the same as the exploration matrix - path_std = os.path.join(path, network_name + "_std.txt") - std_transposed = model.get_std.numpy().T - np.savetxt(path_std, std_transposed) From be2e990c6ec3d2887e3f89e856e66d5284f5bdad Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 4 Sep 2024 09:58:49 -0400 Subject: [PATCH 88/98] wip --- gym/envs/mit_humanoid/lander.py | 16 +++++++++++++++ gym/envs/mit_humanoid/lander_config.py | 28 +++++++++----------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index 972eb413..f86d30f4 100644 --- a/gym/envs/mit_humanoid/lander.py +++ b/gym/envs/mit_humanoid/lander.py @@ -2,12 +2,28 @@ # from gym.envs.base.legged_robot import LeggedRobot from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from isaacgym.torch_utils import torch_rand_float class Lander(MIT_Humanoid): def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + def _resample_commands(self, env_ids): + """Randommly select commands of some environments + + Args: + env_ids (List[int]): Environments ids for which new commands are needed + """ + if len(env_ids) == 0: + return + super()._resample_commands(env_ids) + # * with 75% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.25 + ).unsqueeze(1) + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 0de34b4c..8219d459 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -112,7 +112,7 @@ class control(LeggedRobotCfg.control): } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 1000 + desired_sim_frequency = 500 # class oscillator: # base_frequency = 3.0 # [Hz] @@ -121,13 +121,13 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] - lin_vel_y = 0.0 # max [m/s] - yaw_vel = 0.0 # max [rad/s] + lin_vel_x = [-2.0, 2.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.3 # max [m/s] + yaw_vel = 1.0 # max [rad/s] class push_robots: toggle = True - 
interval_s = 1 + interval_s = 2 max_push_vel_xy = 0.5 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] @@ -146,13 +146,9 @@ class asset(LeggedRobotCfg.asset): foot_name = "foot" penalize_contacts_on = ["arm", "hand", "shoulder"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] # ?? flip_visual_attachments = False self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False - # * see GymDofDriveModeFlags - # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) - default_dof_drive_mode = 3 fix_base_link = False disable_gravity = False disable_motors = False @@ -279,24 +275,18 @@ class critic(LeggedRobotRunnerCfg.critic): class reward: class weights: - # tracking_lin_vel = 0.0 - # tracking_ang_vel = 1.5 - # orientation = 1.0 torques = 5.0e-5 - power = 0 # 1.0e-2 + power = 1e-6 # 1.0e-2 min_base_height = 1.5 + lin_vel_xy = 1.0 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 dof_vel = 0.5 - # stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.25 - # stance = 1.0 - # swing = 1.0 + dof_near_home = 0.75 hips_forward = 0.0 - # walk_freq = 0.0 # 2.5 collision = 1.0 class termination_weight: @@ -330,6 +320,6 @@ class runner(LeggedRobotRunnerCfg.runner): algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 - run_name = "Standing" + run_name = "lander" experiment_name = "Humanoid" save_interval = 50 From 27991100e4388ea3df415b75f4b022a599747b51 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 4 Sep 2024 15:42:46 -0400 Subject: [PATCH 89/98] Refactor for action frequency handled in runner only for lander: needs to overload _check_terminations_and_timeouts to not actually reset so that reset can be handled only by runner (without breaking backwards compatibility) Also keeps a list of rewards, for finer reward integration, and can also do traj stats --- gym/envs/mit_humanoid/lander.py | 7 ++ gym/envs/mit_humanoid/lander_config.py | 15 +-- gym/utils/helpers.py | 2 +- learning/runners/my_runner.py | 128 +++++++++++++++++++++---- 4 files changed, 127 insertions(+), 25 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index f86d30f4..86d37321 100644 --- a/gym/envs/mit_humanoid/lander.py +++ b/gym/envs/mit_humanoid/lander.py @@ -24,6 +24,13 @@ def _resample_commands(self, env_ids): < 0.25 ).unsqueeze(1) + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 8219d459..48c2eb53 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -111,7 +111,7 @@ class control(LeggedRobotCfg.control): "elbow": 1.0, } # [N*m*s/rad] - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 # class oscillator: @@ -218,13 +218,15 @@ class scaling(LeggedRobotCfg.scaling): class LanderRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "MyRunner" class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, 
sigmoid - activation = "tanh" + activation = ["elu", "elu", "tanh"] + layer_norm = True smooth_exploration = False obs = [ @@ -256,7 +258,8 @@ class noise: class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" + layer_norm = True obs = [ # "base_height", @@ -290,7 +293,7 @@ class weights: collision = 1.0 class termination_weight: - termination = 15 + termination = 1.0 class algorithm(LeggedRobotRunnerCfg.algorithm): # both @@ -319,7 +322,7 @@ class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" num_steps_per_env = 24 - max_iterations = 1000 + max_iterations = 500 run_name = "lander" experiment_name = "Humanoid" save_interval = 50 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..8e74d93c 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -191,7 +191,7 @@ def get_args(custom_parameters=None): { "name": "--task", "type": str, - "default": "cartpole", + "default": "lander", "help": "Resume training or start testing from a checkpoint. " "Overrides config file if provided.", }, diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index d97dee8d..96eedc77 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -19,14 +19,18 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + rewards_dict = {} + rewards_list = [dict() for x in range(n_policy_steps)] self.alg.switch_to_train() - actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -35,8 +39,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -52,10 +58,17 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -69,33 +82,48 @@ def learn(self): "critic_obs": critic_obs, } ) - - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards(rewards_list[step], self.env.terminated) + # handle time-outs/terminations happening inside here with mask + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) actor_obs = self.get_noisy_obs( - self.actor_cfg["actor_obs"], self.actor_cfg["noise"] + 
self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + self.update_rewards(rewards_dict, self.env.terminated) + + # integrate rewards from list + total_rewards = torch.stack( + tuple( + torch.stack(tuple(rewards_dict.values())) + for rewards_dict in rewards_list + ) + ).sum(dim=(0, 1)) transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -113,6 +141,21 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( @@ -120,8 +163,57 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def update_rewards(self, rewards_dict, terminated): + # sum existing rewards with new rewards + + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] From 91395f47aca7c013802c35f0bab72f3c640fe2f7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 5 Sep 2024 10:47:18 -0400 Subject: [PATCH 90/98] update torch.load with weights_only=True for depercation --- learning/runners/old_policy_runner.py | 2 +- learning/runners/on_policy_runner.py | 2 +- tests/integration_tests/test_runner_integration.py | 2 +- tests/regression_tests/test_generated_torch_files.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..d6fe5ddf 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -134,7 +134,7 @@ def save(self): ) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index f601ce71..568173d5 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -190,7 +190,7 @@ def save(self): ) def load(self, path, load_optimizer=True): - loaded_dict = torch.load(path) + loaded_dict = torch.load(path, weights_only=True) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 3310624f..ce645b5c 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -35,7 +35,7 @@ def load_saved_policy(runner): load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) - loaded_dict = torch.load(resume_path) + loaded_dict = torch.load(resume_path, weights_only=True) actor.load_state_dict(loaded_dict["actor_state_dict"]) actor.eval() return actor diff --git a/tests/regression_tests/test_generated_torch_files.py b/tests/regression_tests/test_generated_torch_files.py index ef8ddc15..11b9ff5a 100644 --- a/tests/regression_tests/test_generated_torch_files.py +++ b/tests/regression_tests/test_generated_torch_files.py @@ -2,8 +2,8 @@ def compare_tensors(file1, file2): - tensor1 = torch.load(file1) - tensor2 = torch.load(file2) + tensor1 = torch.load(file1, weights_only=True) + tensor2 = torch.load(file2, weights_only=True) assert torch.all( torch.eq(tensor1, tensor2) ), f"Tensors in {file1} and {file2} are not equal." 
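A minimal, self-contained sketch of the checkpoint pattern that the weights_only change above relies on (the paths, keys, and toy modules here are illustrative, not taken from this repo): torch.load(..., weights_only=True) restricts unpickling to tensors and plain Python containers, so checkpoints saved as state-dict dictionaries keep loading while arbitrary pickled objects are rejected.

import torch
import torch.nn as nn

# hypothetical stand-ins for the actor/critic networks
actor = nn.Linear(4, 2)
critic = nn.Linear(4, 1)

checkpoint = {
    "actor_state_dict": actor.state_dict(),
    "critic_state_dict": critic.state_dict(),
    "iter": 100,
}
torch.save(checkpoint, "/tmp/model_100.pt")

# weights_only=True only unpickles tensors and plain containers, which is
# enough for state-dict checkpoints and avoids the warning about the
# default changing in newer PyTorch releases.
loaded = torch.load("/tmp/model_100.pt", weights_only=True)
actor.load_state_dict(loaded["actor_state_dict"])
critic.load_state_dict(loaded["critic_state_dict"])
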
From 0979fab4e98b83c6a1132dae002e08ff22d57cea Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 6 Sep 2024 16:31:38 -0400 Subject: [PATCH 91/98] working SAC on pendulum, ready for merge --- gym/envs/pendulum/pendulum_PSD_config.py | 107 ++++++----------------- gym/envs/pendulum/pendulum_SAC_config.py | 15 +--- learning/runners/psd_sac_runner.py | 1 + scripts/train.py | 1 - 4 files changed, 30 insertions(+), 94 deletions(-) diff --git a/gym/envs/pendulum/pendulum_PSD_config.py b/gym/envs/pendulum/pendulum_PSD_config.py index 056070a4..083588c8 100644 --- a/gym/envs/pendulum/pendulum_PSD_config.py +++ b/gym/envs/pendulum/pendulum_PSD_config.py @@ -1,65 +1,34 @@ -import torch -from gym.envs.base.fixed_robot_config import FixedRobotCfgPPO -from gym.envs.pendulum.pendulum_config import PendulumCfg +from gym.envs.pendulum.pendulum_SAC_config import PendulumSACCfg, PendulumSACRunnerCfg -class PendulumPSDCfg(PendulumCfg): - class env(PendulumCfg.env): - num_envs = 1024 - episode_length_s = 10 +class PendulumPSDCfg(PendulumSACCfg): + class env(PendulumSACCfg.env): + pass - class init_state(PendulumCfg.init_state): - reset_mode = "reset_to_uniform" - default_joint_angles = {"theta": 0.0} - dof_pos_range = { - "theta": [-torch.pi, torch.pi], - } - dof_vel_range = {"theta": [-5, 5]} + class init_state(PendulumSACCfg.init_state): + pass - class control(PendulumCfg.control): - ctrl_frequency = 10 - desired_sim_frequency = 100 + class control(PendulumSACCfg.control): + pass - class asset(PendulumCfg.asset): - joint_damping = 0.1 + class asset(PendulumSACCfg.asset): + pass - class reward_settings(PendulumCfg.reward_settings): - tracking_sigma = 0.25 + class reward_settings(PendulumSACCfg.reward_settings): + pass - class scaling(PendulumCfg.scaling): - dof_vel = 25.0 - dof_pos = 10.0 * torch.pi - tau_ff = 1.0 - torques = 2.5 # 5.0 + class scaling(PendulumSACCfg.scaling): + pass -class PendulumPSDRunnerCfg(FixedRobotCfgPPO): +class PendulumPSDRunnerCfg(PendulumSACRunnerCfg): seed = -1 runner_class_name = "PSACRunner" - class actor: - latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} - mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} - std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} - nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} - # hidden_dims = {"latent": [128, 64], "mean": [32], "std": [32]} - # # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - # activation = {"latent": "elu", "mean": "elu", "std": "elu"} - # layer_norm = {"latent": True, "mean": True, "std": True} + class actor(PendulumSACRunnerCfg.actor): + pass - normalize_obs = False - obs = [ - "dof_pos_obs", - "dof_vel", - ] - actions = ["tau_ff"] - disable_actions = False - - class noise: - dof_pos = 0.0 - dof_vel = 0.0 - - class critic: + class critic(PendulumSACRunnerCfg.critic): obs = ["dof_pos_obs", "dof_vel"] hidden_dims = [16, 16] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid @@ -70,7 +39,8 @@ class critic: normalize_obs = False # critic_class_name = "CholeskyLatent" - critic_class_name = "Critic" + # critic_class_name = "Critic" + critic_class_name = "DenseSpectralLatent" # * some class-specific params minimize = False relative_dim = 4 # 16 @@ -78,41 +48,14 @@ class critic: latent_hidden_dims = [16] latent_activation = "elu" - class reward: - class weights: - theta = 0.0 - omega = 0.0 - equilibrium = 1.0 - energy = 0.5 - dof_vel = 0.0 - torques = 0.025 - - class termination_weight: - termination = 0.0 - - class 
algorithm(FixedRobotCfgPPO.algorithm): - initial_fill = 0 - storage_size = 100 * 1024 # steps_per_episode * num_envs - batch_size = 1024 # 4096 - max_gradient_steps = 10 # 10 # SB3: 1 - action_max = 2.0 - action_min = -2.0 - actor_noise_std = 1.0 - log_std_max = 4.0 - log_std_min = -20.0 - alpha = 0.2 - target_entropy = -1.0 - max_grad_norm = 1.0 - polyak = 0.995 # flipped compared to stable-baselines3 (polyak == 1-tau) - gamma = 0.99 + class algorithm(PendulumSACRunnerCfg.algorithm): alpha_lr = 1e-4 - actor_lr = 1e-3 - critic_lr = 5e-4 + actor_lr = 1e-4 + critic_lr = 1e-4 - class runner(FixedRobotCfgPPO.runner): + class runner(PendulumSACRunnerCfg.runner): run_name = "" - experiment_name = "sac_pendulum" - max_iterations = 5_000 # number of policy updates + experiment_name = "psd_pendulum" algorithm_class_name = "SAC" num_steps_per_env = 1 save_interval = 500 diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index b094ee91..0125cee4 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -37,17 +37,10 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor(FixedRobotCfgPPO.actor): - hidden_dims = { - "latent": [128, 64], - "mean": [32], - "std": [32], - } - # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = { - "latent": "elu", - "mean": "elu", - "std": "elu", - } + latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} + mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} + nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} normalize_obs = False obs = [ diff --git a/learning/runners/psd_sac_runner.py b/learning/runners/psd_sac_runner.py index 0ba89104..57b411a1 100644 --- a/learning/runners/psd_sac_runner.py +++ b/learning/runners/psd_sac_runner.py @@ -7,6 +7,7 @@ from learning.runners import OffPolicyRunner from learning.modules import Critic, ChimeraActor from learning.modules.QRCritics import * # noqa F401 +from learning.modules.TaylorCritics import * # noqa F401 from learning.storage import ReplayBuffer from learning.algorithms import SAC diff --git a/scripts/train.py b/scripts/train.py index a2535408..759992cf 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -31,4 +31,3 @@ def train(train_cfg, policy_runner): if __name__ == "__main__": train_cfg, policy_runner = setup() train(train_cfg=train_cfg, policy_runner=policy_runner) - print(f"Final value offset: {policy_runner.alg.critic.value_offset.item()}") From f105c0eabc784fa5f93e487f29a78fe752d3af47 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 09:31:47 -0400 Subject: [PATCH 92/98] update nn_params --- gym/envs/pendulum/pendulum_SAC_config.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 0125cee4..836a6185 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -43,10 +43,7 @@ class actor(FixedRobotCfgPPO.actor): nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} normalize_obs = False - obs = [ - "dof_pos_obs", - "dof_vel", - ] + obs = ["dof_pos_obs", "dof_vel"] actions = ["tau_ff"] disable_actions = False From c04b563f03a540a599fdee1522667e1eb4b7dda9 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 14:06:18 -0400 Subject: [PATCH 
93/98] apply stash w/ sim-freq reward sampling. Adjust some reward weights of action_rate (mini-cheetah) to work decently. --- gym/envs/base/task_skeleton.py | 8 +- .../mini_cheetah/mini_cheetah_osc_config.py | 7 +- gym/envs/mini_cheetah/mini_cheetah_ref.py | 9 ++ .../mini_cheetah/mini_cheetah_ref_config.py | 9 +- gym/envs/mit_humanoid/lander_config.py | 2 +- gym/utils/task_registry.py | 3 + learning/runners/my_runner.py | 57 +++++++---- learning/runners/off_policy_runner.py | 95 ++++++++++++------- learning/runners/on_policy_runner.py | 69 +++++++++----- 9 files changed, 178 insertions(+), 81 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 0974970a..4f931eab 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,10 +5,14 @@ class TaskSkeleton: - def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"): + def __init__( + self, num_envs=1, max_episode_length=1.0, device="cpu", auto_reset=True + ): self.num_envs = num_envs self.max_episode_length = max_episode_length self.device = device + self.auto_reset = auto_reset + return None def get_states(self, obs_list): @@ -70,7 +74,7 @@ def _check_terminations_and_timeouts(self): contact_forces = self.contact_forces[:, self.termination_contact_indices, :] self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length - self.to_be_reset = self.timed_out | self.terminated + # self.to_be_reset = self.timed_out | self.terminated def step(self, actions): raise NotImplementedError diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index ec283033..34996404 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -64,7 +64,7 @@ class control(MiniCheetahCfg.control): # * PD Drive parameters: stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class osc: @@ -163,6 +163,7 @@ class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -221,8 +222,8 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0 - action_rate = 0.1 # -0.01 - action_rate2 = 0.01 # -0.001 + action_rate = 0.001 # -0.01 + action_rate2 = 0.0001 # -0.001 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..d3f8a89b 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -50,6 +50,15 @@ def _resample_commands(self, env_ids): ).squeeze(1) self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- + def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( diff --git 
a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 1803c765..b8a7a2fe 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -26,7 +26,7 @@ class control(MiniCheetahCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class commands(MiniCheetahCfg.commands): @@ -70,6 +70,7 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -122,12 +123,12 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-8 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.1 - action_rate2 = 0.01 + action_rate = 1e-4 + action_rate2 = 1e-5 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 48c2eb53..86f3329f 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -282,7 +282,7 @@ class weights: power = 1e-6 # 1.0e-2 min_base_height = 1.5 lin_vel_xy = 1.0 - action_rate = 1e-3 + action_rate = 1e-2 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 diff --git a/gym/utils/task_registry.py b/gym/utils/task_registry.py index 6c2025ba..cc0787ca 100644 --- a/gym/utils/task_registry.py +++ b/gym/utils/task_registry.py @@ -178,6 +178,9 @@ def set_control_and_sim_dt(self, env_cfg, train_cfg): f" to {env_cfg.sim_dt}." 
) + if not hasattr(train_cfg.actor, "frequency"): + train_cfg.actor.frequency = env_cfg.control.ctrl_frequency + def set_discount_rates(self, train_cfg, dt): if hasattr(train_cfg.algorithm, "discount_horizon"): hrzn = train_cfg.algorithm.discount_horizon diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 15192d2d..4c2f5d52 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -39,8 +39,7 @@ def learn(self, states_to_log_dict=None): n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" - rewards_dict = {} - rewards_list = [dict() for x in range(n_policy_steps)] + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -104,31 +103,21 @@ def learn(self, states_to_log_dict=None): for step in range(n_policy_steps): self.env.step() # put reward integration here - self.update_rewards(rewards_list[step], self.env.terminated) - # handle time-outs/terminations happening inside here with mask + self.update_rewards_dict(rewards_dict, step) else: # catch and reset failed envs to_be_reset = self.env.timed_out | self.env.terminated env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() self.env._reset_idx(env_ids) + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - - self.update_rewards(rewards_dict, self.env.terminated) - - # integrate rewards from list - total_rewards = torch.stack( - tuple( - torch.stack(tuple(rewards_dict.values())) - for rewards_dict in rewards_list - ) - ).sum(dim=(0, 1)) - transition.update( { "next_actor_obs": actor_obs, @@ -140,7 +129,7 @@ def learn(self, states_to_log_dict=None): ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") @@ -205,6 +194,40 @@ def update_rewards(self, rewards_dict, terminated): ) ) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, + ) + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, + ) + def sim_and_log_states(self, states_to_log_dict, it_idx): # Simulate environment for as many steps as expected in the dict. # Log states to the dict, as well as whether the env terminated. 
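The reward-integration pattern added to my_runner.py above, and repeated in the runners below, boils down to: pre-allocate one buffer per reward term with a leading per-sim-step dimension, fill one slice after every env.step(), then reduce over that dimension when the policy-rate transition is assembled. A standalone sketch, assuming tensordict's TensorDict (which the runner code appears to rely on) and made-up reward names:

import torch
from tensordict import TensorDict

n_policy_steps, num_envs, device = 5, 8, "cpu"

# one zeroed buffer per reward term, shaped (sim steps per policy step, envs)
rewards = TensorDict(
    {
        name: torch.zeros(n_policy_steps, num_envs, device=device)
        for name in ("tracking", "torques", "termination")
    },
    batch_size=(n_policy_steps, num_envs),
)

for step in range(n_policy_steps):
    # stand-in for get_rewards(...) evaluated after each env.step()
    step_rewards = {
        name: torch.rand(num_envs, device=device) for name in rewards.keys()
    }
    rewards[step].update(step_rewards, inplace=True)

# per-term sums over the sim-step dimension, then the per-env total
per_term = rewards.sum(dim=0)
total = torch.stack(tuple(per_term.values())).sum(dim=0)
print(total.shape)  # torch.Size([8])
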
diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 7aed58e7..516c4a3e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -48,7 +48,10 @@ def _set_up_alg(self): def learn(self): self.set_up_logger() - rewards_dict = {} + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -94,28 +97,32 @@ def learn(self): } ) - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) @@ -147,35 +154,39 @@ def learn(self): } ) - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -192,20 +203,40 @@ def learn(self): self.save() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( self.get_rewards( - self.critic_cfg["reward"]["termination_weight"], mask=terminated - ) + self.critic_cfg["reward"]["termination_weight"], + 
modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + rewards_dict[step].update( self.get_rewards( self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, - ) + mask=~self.env.terminated, + ), + inplace=True, ) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 0a81680d..7167f03e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -24,7 +24,10 @@ def __init__(self, env, train_cfg, device="cpu"): def learn(self, states_to_log_dict=None): self.set_up_logger() - rewards_dict = {} + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -86,37 +89,39 @@ def learn(self, states_to_log_dict=None): "critic_obs": critic_obs, } ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) - self.env.step() + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "terminated": terminated, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -149,19 +154,39 @@ def burn_in_normalization(self, n_iterations=100): self.alg.critic.evaluate(critic_obs) self.env.reset() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( self.get_rewards( - self.critic_cfg["reward"]["termination_weight"], mask=terminated - ) + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + 
mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + rewards_dict[step].update( self.get_rewards( self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, - ) + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) From e3956738f7ae709874a889b7868abd5fc38dd3f3 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 14:23:38 -0400 Subject: [PATCH 94/98] refactor skeleton with a super_init fix, and pre-initialize reward buffer. Doesn't seem to make a difference in speed. --- gym/envs/base/base_task.py | 19 +++---------------- gym/envs/base/task_skeleton.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/base_task.py b/gym/envs/base/base_task.py index 551143a3..033e0eeb 100644 --- a/gym/envs/base/base_task.py +++ b/gym/envs/base/base_task.py @@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): # * env device is GPU only if sim is on GPU and use_gpu_pipeline=True, # * otherwise returned tensors are copied to CPU by physX. if sim_device_type == "cuda" and sim_params.use_gpu_pipeline: - self.device = self.sim_device + device = self.sim_device else: - self.device = "cpu" + device = "cpu" # * graphics device for rendering, -1 for no rendering self.graphics_device_id = self.sim_device_id - self.num_envs = cfg.env.num_envs self.num_actuators = cfg.env.num_actuators # * optimization flags for pytorch JIT torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) - # allocate buffers - self.to_be_reset = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.terminated = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.episode_length_buf = torch.zeros( - self.num_envs, device=self.device, dtype=torch.long - ) - self.timed_out = torch.zeros( - self.num_envs, device=self.device, dtype=torch.bool - ) + super().__init__(num_envs=cfg.env.num_envs, device=device) # todo: read from config self.enable_viewer_sync = True diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 4f931eab..d3585e45 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,13 +5,15 @@ class TaskSkeleton: - def __init__( - self, num_envs=1, max_episode_length=1.0, device="cpu", auto_reset=True - ): + def __init__(self, num_envs=1, device="cpu"): self.num_envs = num_envs - self.max_episode_length = max_episode_length self.device = device - self.auto_reset = auto_reset + + self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool) + self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) + self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) + self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) + self.reward_buf = torch.zeros(num_envs, device=device, dtype=torch.float) return None @@ 
-61,10 +63,11 @@ def compute_reward(self, reward_weights): reward_weights: dict with keys matching reward names, and values matching weights """ - reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) + # reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) + self.reward_buf[:] = 0 for name, weight in reward_weights.items(): - reward += weight * self._eval_reward(name) - return reward + self.reward_buf += weight * self._eval_reward(name) + return self.reward_buf def _eval_reward(self, name): return eval("self._reward_" + name + "()") From ea5cdff304ef6edc78dbd66e4ecfcd9d7d4cccb1 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 15:09:09 -0400 Subject: [PATCH 95/98] refactor: redo rewards computation, with a dict of reward functions in the runner. Seems to make a tiny difference in speed. More importantly, it removes the `eval` hack, which means we can profile individual reward functions. THIS BREAKS PBRS --- gym/envs/base/task_skeleton.py | 15 --------------- learning/runners/BaseRunner.py | 15 +++++++++++---- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index d3585e45..4dfcb280 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -13,7 +13,6 @@ def __init__(self, num_envs=1, device="cpu"): self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) - self.reward_buf = torch.zeros(num_envs, device=device, dtype=torch.float) return None @@ -58,20 +57,6 @@ def _reset_buffers(self): self.terminated[:] = False self.timed_out[:] = False - def compute_reward(self, reward_weights): - """Compute and return a torch tensor of rewards - reward_weights: dict with keys matching reward names, and values - matching weights - """ - # reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) - self.reward_buf[:] = 0 - for name, weight in reward_weights.items(): - self.reward_buf += weight * self._eval_reward(name) - return self.reward_buf - - def _eval_reward(self, name): - return eval("self._reward_" + name + "()") - def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 01a1b825..1545293b 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -8,8 +8,8 @@ class BaseRunner: def __init__(self, env, train_cfg, device="cpu"): self.device = device self.env = env + self.setup_reward_functions() self.parse_train_cfg(train_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] @@ -18,6 +18,13 @@ def __init__(self, env, train_cfg, device="cpu"): self.log_dir = train_cfg["log_dir"] self._set_up_alg() + def setup_reward_functions(self): + self.reward_functions = { + method.replace("_reward_", ""): getattr(self.env, method) + for method in dir(self.env) + if callable(getattr(self.env, method)) and method.startswith("_reward_") + } + def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) @@ -95,8 +102,8 @@ def get_rewards(self, reward_weights, modifier=1, 
mask=None): if mask is None: mask = 1.0 for name, weight in reward_weights.items(): - rewards_dict[name] = mask * self._get_reward({name: weight}, modifier) + rewards_dict[name] = mask * self._get_reward(name, weight * modifier) return rewards_dict - def _get_reward(self, name_weight, modifier=1): - return modifier * self.env.compute_reward(name_weight).to(self.device) + def _get_reward(self, name, weight): + return weight * self.reward_functions[name]().to(self.device) From 7296c6c48903a5458c2e3d4e2c50e72bf430d97a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 15:58:57 -0400 Subject: [PATCH 96/98] compute switch once per decimation (speedup ~10%) --- gym/envs/mini_cheetah/mini_cheetah_osc.py | 20 +++++++++++++------ gym/envs/mini_cheetah/mini_cheetah_ref.py | 19 ++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py index acec0b8a..6d44e35d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py @@ -16,6 +16,9 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + + self._switch = torch.zeros(self.num_envs, 1, device=self.device) + self.oscillators = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 8, device=self.device) @@ -158,6 +161,7 @@ def _post_decimation_step(self): """Update all states that are not handled in PhysX""" super()._post_decimation_step() self.grf = self._compute_grf() + self._update_cmd_switch() # self._step_oscillators() def _post_physx_step(self): @@ -266,12 +270,16 @@ def _compute_grf(self, grf_norm=True): else: return grf - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( + self._switch = torch.exp( -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / self.cfg.reward_settings.switch_scale ) + # return torch.exp( + # -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + # / self.cfg.reward_settings.switch_scale + # ) def _reward_cursorial(self): # penalize the abad joints being away from 0 @@ -357,10 +365,10 @@ def _reward_stance_velocity(self): return -torch.mean(rew, dim=1) def _reward_dof_vel(self): - return super()._reward_dof_vel() * self._switch() + return super()._reward_dof_vel() * self._switch def _reward_dof_near_home(self): - return super()._reward_dof_near_home() * self._switch() + return super()._reward_dof_near_home() * self._switch def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -371,11 +379,11 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_standing_torques(self): """Penalize torques at zero commands""" - return super()._reward_torques() * self._switch() + return super()._reward_torques() * self._switch # * gait similarity scores def angle_difference(self, theta1, theta2): diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index d3f8a89b..0084a80c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ 
b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -17,6 +17,7 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._switch = torch.zeros(self.num_envs, 1, device=self.device) self.phase = torch.zeros( self.num_envs, 1, dtype=torch.float, device=self.device ) @@ -41,6 +42,7 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self._update_cmd_switch() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) @@ -59,10 +61,11 @@ def _check_terminations_and_timeouts(self): # --- - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + self._switch = torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + / self.cfg.reward_settings.switch_scale ) def _reward_swing_grf(self): @@ -73,7 +76,7 @@ def _reward_swing_grf(self): ) ph_off = torch.lt(self.phase, torch.pi) rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_stance_grf(self): """Reward non-zero grf during stance (pi to 2pi)""" @@ -84,7 +87,7 @@ def _reward_stance_grf(self): ph_off = torch.gt(self.phase, torch.pi) # should this be in swing? rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_reference_traj(self): """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE""" @@ -93,7 +96,7 @@ def _reward_reference_traj(self): error /= self.scales["dof_pos"] reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1) # * only when commanded velocity is higher - return reward * (1 - self._switch()) + return reward * (1 - self._switch) def _get_ref(self): leg_frame = torch.zeros_like(self.torques) @@ -121,10 +124,10 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_tracking_lin_vel(self): """Tracking linear velocity commands (xy axes)""" # just use lin_vel? 
reward = super()._reward_tracking_lin_vel() - return reward * (1 - self._switch()) + return reward * (1 - self._switch) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index b8a7a2fe..7a54bb1d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -60,6 +60,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): max_contact_force = 600.0 base_height_target = 0.3 tracking_sigma = 0.25 + switch_scale = 0.1 class scaling(MiniCheetahCfg.scaling): pass From 69218dd1a7ffcd1c8a898a160f1bb6f17a75e888 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 10 Sep 2024 17:55:53 -0400 Subject: [PATCH 97/98] fixed logging bug that scaled with wrong dt --- gym/envs/base/fixed_robot.py | 21 ++++-------- gym/envs/base/legged_robot.py | 20 ++++------- gym/envs/base/legged_robot_config.py | 3 -- .../mini_cheetah/mini_cheetah_ref_config.py | 33 ++++++++++++++----- gym/envs/pendulum/pendulum_SAC_config.py | 3 +- gym/envs/pendulum/pendulum_config.py | 1 + learning/runners/my_runner.py | 17 +++++----- learning/runners/off_policy_runner.py | 17 +++++----- learning/runners/on_policy_runner.py | 17 +++++----- 9 files changed, 64 insertions(+), 68 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 544eba52..f623e5d1 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -580,27 +580,18 @@ def _reward_dof_vel(self): def _reward_action_rate(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - self.dof_pos_history[:, nact : 2 * nact] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :] ) return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - 2 * self.dof_pos_history[:, nact : 2 * nact] - + self.dof_pos_history[:, 2 * nact :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] + - 2 * self.dof_pos_history[:, nact : 2 * nact] + + self.dof_pos_history[:, 2 * nact :] ) return -torch.mean(error, dim=1) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 4433d4fb..c80ebc85 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -1030,26 +1030,18 @@ def _reward_dof_vel(self): def _reward_action_rate(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :] ) return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - - 2 * self.dof_pos_history[:, n : 2 * n] - + self.dof_pos_history[:, 2 * n :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] + - 2 * self.dof_pos_history[:, n : 2 * n] + + self.dof_pos_history[:, 2 * n :] ) return -torch.mean(error, dim=1) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 
4336a749..c38f3e33 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -301,9 +301,6 @@ class algorithm: batch_size = 2**15 max_gradient_steps = 24 # new - storage_size = 2**17 # new - batch_size = 2**15 # new - clip_param = 0.2 learning_rate = 1.0e-3 max_grad_norm = 1.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7a54bb1d..d3a301d3 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -85,7 +85,7 @@ class actor(MiniCheetahRunnerCfg.actor): "dof_vel", "phase_obs", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -115,7 +115,7 @@ class critic(MiniCheetahRunnerCfg.critic): "phase_obs", "dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: @@ -128,13 +128,13 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 1e-4 - action_rate2 = 1e-5 + action_rate = 1e-5 + action_rate2 = 1e-6 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 + reference_traj = 0.0 swing_grf = 1.5 stance_grf = 1.5 @@ -142,11 +142,28 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - pass + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2 * 4096 # use all the data + max_gradient_steps = 50 + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 1000 # number of policy updates + max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 # deprecate + num_steps_per_env = 20 # deprecate diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 836a6185..42bf0265 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -17,7 +17,7 @@ class init_state(PendulumCfg.init_state): dof_vel_range = {"theta": [-5, 5]} class control(PendulumCfg.control): - ctrl_frequency = 10 + ctrl_frequency = 100 desired_sim_frequency = 100 class asset(PendulumCfg.asset): @@ -37,6 +37,7 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor(FixedRobotCfgPPO.actor): + frequency = 10 latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index bb33c1f1..910fe341 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -62,6 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OnPolicyRunner" class actor(FixedRobotCfgPPO.actor): + frequency = 25 hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 4c2f5d52..e0da1a1b 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -16,12 
+16,6 @@ class MyRunner(OnPolicyRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -34,10 +28,9 @@ def _set_up_alg(self): self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def learn(self, states_to_log_dict=None): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -164,7 +157,13 @@ def burn_in_normalization(self, n_iterations=100): self.alg.critic.evaluate(critic_obs) self.env.reset() - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 516c4a3e..83249aef 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -16,12 +16,6 @@ class OffPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -46,10 +40,9 @@ def _set_up_alg(self): ) def learn(self): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -237,7 +230,13 @@ def initialize_rewards_dict(self, n_steps): ) return rewards_dict - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 7167f03e..4831b615 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -14,18 +14,11 @@ class OnPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def learn(self, states_to_log_dict=None): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -188,7 +181,13 @@ def initialize_rewards_dict(self, n_steps): ) return rewards_dict - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + 
self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) From b1bd4f2b9b20c2c67877d61e8c4daee513d65b17 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 11 Sep 2024 14:37:52 -0400 Subject: [PATCH 98/98] hardcode Jacobian (halves time for _apply_coupling) --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 ++ gym/envs/mit_humanoid/mit_humanoid.py | 28 ++++++------ gym/envs/mit_humanoid/mit_humanoid_config.py | 43 +++++++++---------- learning/modules/QRCritics.py | 2 +- 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index d3a301d3..37c52919 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class actor(MiniCheetahRunnerCfg.actor): frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + layer_norm = [True, True, False] activation = "elu" smooth_exploration = False exploration_sample_freq = 16 @@ -102,6 +103,9 @@ class noise: class critic(MiniCheetahRunnerCfg.critic): hidden_dims = [256, 256, 128] + layer_norm = [True, True, False] + dropouts = [0.1, 0.0, 0.0] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" obs = [ diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index a580fd57..a2e082e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -32,29 +32,28 @@ def _init_sampled_history_buffers(self): self.sampled_history_threshold = int( self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) + self.J = torch.eye(self.num_dof).to(self.device) + self.J[4, 3] = 1 + self.J[9, 8] = 1 + self.J_inv_T = torch.inverse(self.J.T) def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): # Create a Jacobian matrix and move it to the same device as input tensors - J = torch.eye(q.shape[-1]).to(q.device) - J[4, 3] = 1 - J[9, 8] = 1 # Perform transformations using Jacobian - q = torch.matmul(q, J.T) - qd = torch.matmul(qd, J.T) - q_des = torch.matmul(q_des, J.T) - qd_des = torch.matmul(qd_des, J.T) - - # Inverse of the transpose of Jacobian - J_inv_T = torch.inverse(J.T) + q = torch.matmul(q, self.J.T) + qd = torch.matmul(qd, self.J.T) + q_des = torch.matmul(q_des, self.J.T) + qd_des = torch.matmul(qd_des, self.J.T) # Compute feed-forward torques - tau_ff = torch.matmul(J_inv_T, tau_ff.T).T + tau_ff = torch.matmul(self.J_inv_T, tau_ff.T).T # Compute kp and kd terms kp = torch.diagonal( torch.matmul( - torch.matmul(J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), J_inv_T.T + torch.matmul(self.J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), + self.J_inv_T.T, ), dim1=-2, dim2=-1, @@ -62,7 +61,8 @@ def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): kd = torch.diagonal( torch.matmul( - torch.matmul(J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), J_inv_T.T + torch.matmul(self.J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), + self.J_inv_T.T, ), dim1=-2, dim2=-1, @@ -70,7 +70,7 @@ def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): # Compute torques torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff - torques = torch.matmul(torques, J) + torques = torch.matmul(torques, self.J) return torques diff --git 
a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index dc02a211..861f2826 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -112,7 +112,7 @@ class control(LeggedRobotCfg.control): } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 1000 + desired_sim_frequency = 500 filter_gain = 0.1586 # 1: no filtering, 0: wall @@ -123,7 +123,7 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_x = [-0.0, 4.0] # min max [m/s] [-0.75, 0.75] lin_vel_y = 0.0 # max [m/s] yaw_vel = 0.0 # max [rad/s] @@ -227,10 +227,12 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] smooth_exploration = False obs = [ @@ -238,16 +240,15 @@ class actor(LeggedRobotRunnerCfg.actor): "base_lin_vel", "base_ang_vel", "projected_gravity", - # "commands", + "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", - # "sampled_history_dof_pos", - # "sampled_history_dof_vel", - # "sampled_history_dof_pos_target", - "oscillator_obs", + # "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -264,26 +265,28 @@ class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] obs = [ "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", - "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: - # tracking_lin_vel = 0.0 - # tracking_ang_vel = 1.5 + tracking_lin_vel = 4.0 + tracking_ang_vel = 0.5 # orientation = 1.0 torques = 5.0e-4 - # min_base_height = 1.5 + min_base_height = 1.5 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 @@ -292,8 +295,6 @@ class weights: # stand_still = 0.25 dof_pos_limits = 0.25 dof_near_home = 0.25 - stance = 1.0 - swing = 1.0 hips_forward = 0.0 walk_freq = 0.0 # 2.5 @@ -305,12 +306,8 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): gamma = 0.99 lam = 0.95 # shared - batch_size = 2**15 - max_gradient_steps = 24 - # new - storage_size = 2**17 # new - batch_size = 2**15 # new - + batch_size = 4096 + max_gradient_steps = 48 clip_param = 0.2 learning_rate = 1.0e-3 max_grad_norm = 1.0 diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py index d2ed9378..9dd51489 100644 --- a/learning/modules/QRCritics.py +++ b/learning/modules/QRCritics.py @@ -19,7 +19,7 @@ def init_weights(m): m.bias.data.fill_(0.01) -class Critic(nn.Module): +class Critic2(nn.Module): def __init__( self, num_obs,
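The patch above caches the coupling Jacobian: `J` and its inverse-transpose `J_inv_T` are built once at init time rather than reconstructed and inverted on every call to `_apply_coupling`, which is where the reported halving of that function's runtime comes from. Below is a minimal sketch of the same idea, assuming (num_envs, num_dof) tensors for all joint quantities with num_dof >= 10; the class name `CoupledPD` is illustrative, and only the off-diagonal entries (column 3 into row 4, column 8 into row 9) are taken from the hunk above.

import torch

class CoupledPD:
    def __init__(self, num_dof: int, device: str = "cpu"):
        # Coupling Jacobian: identity plus the two off-diagonal terms from the patch.
        J = torch.eye(num_dof, device=device)
        J[4, 3] = 1.0
        J[9, 8] = 1.0
        # Precomputed once instead of rebuilt and inverted every control step.
        self.J = J
        self.J_inv_T = torch.inverse(J.T)

    def torques(self, q, qd, q_des, qd_des, kp, kd, tau_ff):
        # Map positions and velocities into the coupled coordinates.
        q, qd = q @ self.J.T, qd @ self.J.T
        q_des, qd_des = q_des @ self.J.T, qd_des @ self.J.T
        # Transform feed-forward torques and diagonal gains consistently.
        tau_ff = tau_ff @ self.J_inv_T.T
        kp = torch.diagonal(self.J_inv_T @ torch.diag_embed(kp) @ self.J_inv_T.T, dim1=-2, dim2=-1)
        kd = torch.diagonal(self.J_inv_T @ torch.diag_embed(kd) @ self.J_inv_T.T, dim1=-2, dim2=-1)
        tau = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff
        return tau @ self.J  # back to actuator coordinates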