From e19ec214976a2d4d34b52ad8a8fc6e43c1f1a833 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 16 Feb 2024 13:30:46 -0500 Subject: [PATCH 01/69] add new PPO2, for now just split up actor and critic. Tested, 100% the same as PPO. --- learning/algorithms/__init__.py | 1 + learning/algorithms/ppo2.py | 232 ++++++++++++++++++++++++++++++++ learning/runners/BaseRunner.py | 2 +- 3 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 learning/algorithms/ppo2.py diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 7827379b..ace1b9fe 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -31,4 +31,5 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .ppo import PPO +from .ppo2 import PPO2 from .SE import StateEstimator diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py new file mode 100644 index 00000000..6649f396 --- /dev/null +++ b/learning/algorithms/ppo2.py @@ -0,0 +1,232 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# +# Copyright (c) 2021 ETH Zurich, Nikita Rudin + +import torch +import torch.nn as nn +import torch.optim as optim + +# from learning.modules import ActorCritic +from learning.storage import RolloutStorage + + +class PPO2: + # actor_critic: ActorCritic + + def __init__( + self, + actor_critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + value_loss_coef=1.0, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + # todo keeping actor_critic for loading code in runner + self.actor_critic = actor_critic.to(self.device) + self.actor = actor_critic.actor.to(self.device) + self.critic = actor_critic.critic.to(self.device) + self.storage = None # initialized later + parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.optimizer = optim.Adam(parameters, lr=learning_rate) + self.transition = RolloutStorage.Transition() + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def init_storage( + self, + num_envs, + num_transitions_per_env, + actor_obs_shape, + critic_obs_shape, + action_shape, + ): + self.storage = RolloutStorage( + num_envs, + num_transitions_per_env, + actor_obs_shape, + critic_obs_shape, + action_shape, + self.device, + ) + + def test_mode(self): + self.actor.test() + self.critic.test() + + def train_mode(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + # * Compute the actions and values + self.transition.actions = self.actor.act(obs).detach() + self.transition.values = self.critic.evaluate(critic_obs).detach() + self.transition.actions_log_prob = self.actor.get_actions_log_prob( + self.transition.actions + ).detach() + self.transition.action_mean = self.actor.action_mean.detach() + self.transition.action_sigma = self.actor.action_std.detach() + # * need to record obs and critic_obs before env.step() + self.transition.observations = obs + self.transition.critic_observations = critic_obs + return self.transition.actions + + def process_env_step(self, rewards, dones, timed_out=None): + self.transition.rewards = rewards.clone() + self.transition.dones = dones + # * Bootstrapping on time outs + if timed_out is not None: + self.transition.rewards += self.gamma * self.transition.values * timed_out + + # * Record the transition + self.storage.add_transitions(self.transition) + self.transition.clear() + + def compute_returns(self, last_critic_obs): + last_values = self.critic.evaluate(last_critic_obs).detach() + self.storage.compute_returns(last_values, self.gamma, self.lam) + + def update(self): + self.mean_value_loss = 0 + self.mean_surrogate_loss = 0 + generator = self.storage.mini_batch_generator( + self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + critic_obs_batch, + actions_batch, + target_values_batch, + advantages_batch, + returns_batch, + old_actions_log_prob_batch, + old_mu_batch, + old_sigma_batch, + ) in generator: + self.actor.act(obs_batch) + actions_log_prob_batch = 
self.actor.get_actions_log_prob(actions_batch) + value_batch = self.critic.evaluate(critic_obs_batch) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / old_sigma_batch + 1.0e-5) + + ( + torch.square(old_sigma_batch) + + torch.square(old_mu_batch - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch - torch.squeeze(old_actions_log_prob_batch) + ) + surrogate = -torch.squeeze(advantages_batch) * ratio + surrogate_clipped = -torch.squeeze(advantages_batch) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + # * Value function loss + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + loss = ( + surrogate_loss + + self.value_loss_coef * value_loss + - self.entropy_coef * entropy_batch.mean() + ) + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_( + list(self.actor.parameters()) + list(self.critic.parameters()), + self.max_grad_norm, + ) + self.optimizer.step() + + self.mean_value_loss += value_loss.item() + self.mean_surrogate_loss += surrogate_loss.item() + + num_updates = self.num_learning_epochs * self.num_mini_batches + self.mean_value_loss /= num_updates + self.mean_surrogate_loss /= num_updates + self.storage.clear() diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 627ba58f..5c1cf798 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,5 +1,5 @@ import torch -from learning.algorithms import PPO +from learning.algorithms import * # noqa from learning.modules import ActorCritic from learning.env import VecEnv from learning.utils import remove_zero_weighted_rewards From 1a7977e5e5b3fcb67eb8f620e5d460d6af0fc848 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 16 Feb 2024 17:33:04 -0500 Subject: [PATCH 02/69] PPO2: split optimizers for critic and actor --- gym/envs/mini_cheetah/mini_cheetah_config.py | 2 +- .../mini_cheetah/mini_cheetah_ref_config.py | 5 +- learning/algorithms/ppo.py | 22 +++++- learning/algorithms/ppo2.py | 74 +++++++++++++++---- learning/storage/rollout_storage.py | 2 +- 5 files changed, 82 insertions(+), 23 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..502dc571 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -196,7 +196,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): learning_rate = 1.0e-5 schedule = "adaptive" # can be adaptive or 
fixed discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 2.0 # [s] + # GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..867a61db 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -145,7 +145,8 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): learning_rate = 5.0e-5 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] - GAE_bootstrap_horizon = 1.0 # [s] + lam = 0.95 + # GAE_bootstrap_horizon = 1.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 @@ -153,5 +154,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..f492ef0e 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -69,7 +69,12 @@ def __init__( self.actor_critic = actor_critic self.actor_critic.to(self.device) self.storage = None # initialized later - self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate) + self.optimizer = optim.Adam( + self.actor_critic.actor.parameters(), lr=learning_rate + ) + self.crit_optimizer = optim.Adam( + self.actor_critic.critic.parameters(), lr=learning_rate + ) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -156,7 +161,6 @@ def update(self): actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) - value_batch = self.actor_critic.evaluate(critic_obs_batch) mu_batch = self.actor_critic.action_mean sigma_batch = self.actor_critic.action_std entropy_batch = self.actor_critic.entropy @@ -195,6 +199,7 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss + value_batch = self.actor_critic.evaluate(critic_obs_batch) if self.use_clipped_value_loss: value_clipped = target_values_batch + ( value_batch - target_values_batch @@ -207,16 +212,25 @@ def update(self): loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm) + nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.max_grad_norm + ) self.optimizer.step() + self.crit_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_( + self.actor_critic.critic.parameters(), self.max_grad_norm + ) + self.crit_optimizer.step() + self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 6649f396..06e7fabd 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -71,8 +71,9 @@ def __init__( self.actor = actor_critic.actor.to(self.device) self.critic = actor_critic.critic.to(self.device) self.storage = None # initialized later - parameters = list(self.actor.parameters()) + list(self.critic.parameters()) - self.optimizer = optim.Adam(parameters, lr=learning_rate) + # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_optimizer = 
optim.Adam(self.critic.parameters(), lr=learning_rate) self.transition = RolloutStorage.Transition() # * PPO parameters @@ -143,6 +144,9 @@ def compute_returns(self, last_critic_obs): def update(self): self.mean_value_loss = 0 self.mean_surrogate_loss = 0 + + self.update_critic() + generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) @@ -159,7 +163,7 @@ def update(self): ) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) - value_batch = self.critic.evaluate(critic_obs_batch) + # value_batch = self.critic.evaluate(critic_obs_batch) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -198,19 +202,19 @@ def update(self): surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() # * Value function loss - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() + # if self.use_clipped_value_loss: + # value_clipped = target_values_batch + ( + # value_batch - target_values_batch + # ).clamp(-self.clip_param, self.clip_param) + # value_losses = (value_batch - returns_batch).pow(2) + # value_losses_clipped = (value_clipped - returns_batch).pow(2) + # value_loss = torch.max(value_losses, value_losses_clipped).mean() + # else: + # value_loss = (returns_batch - value_batch).pow(2).mean() loss = ( surrogate_loss - + self.value_loss_coef * value_loss + # + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean() ) @@ -223,10 +227,50 @@ def update(self): ) self.optimizer.step() - self.mean_value_loss += value_loss.item() + # self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() num_updates = self.num_learning_epochs * self.num_mini_batches - self.mean_value_loss /= num_updates + # self.mean_value_loss /= num_updates self.mean_surrogate_loss /= num_updates self.storage.clear() + + def update_critic(self): + self.mean_value_loss = 0 + counter = 0 + generator = self.storage.mini_batch_generator( + self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + critic_obs_batch, + actions_batch, + target_values_batch, + advantages_batch, + returns_batch, + old_actions_log_prob_batch, + old_mu_batch, + old_sigma_batch, + ) in generator: + value_batch = self.critic.evaluate(critic_obs_batch) + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + # * Gradient Step + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_policy(self): + pass diff --git a/learning/storage/rollout_storage.py b/learning/storage/rollout_storage.py index bba48e02..38a315d7 100644 --- 
a/learning/storage/rollout_storage.py +++ b/learning/storage/rollout_storage.py @@ -182,7 +182,7 @@ def get_statistics(self): trajectory_lengths = done_indices[1:] - done_indices[:-1] return trajectory_lengths.float().mean(), self.rewards.mean() - def mini_batch_generator(self, num_mini_batches, num_epochs=8): + def mini_batch_generator(self, num_mini_batches=1, num_epochs=8): batch_size = self.num_envs * self.num_transitions_per_env mini_batch_size = batch_size // num_mini_batches indices = torch.randperm( From 3253a659867370c0ce86889677d02485cacde97d Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 09:33:37 -0500 Subject: [PATCH 03/69] cleaning --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +- learning/algorithms/ppo2.py | 115 +++++++----------- learning/runners/BaseRunner.py | 4 +- 3 files changed, 51 insertions(+), 72 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 867a61db..c59f8a56 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -135,7 +135,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params - value_loss_coef = 1.0 + value_loss_coef = 1.0 # deprecate for PPO2 use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 @@ -146,7 +146,7 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 - # GAE_bootstrap_horizon = 1.0 # [s] + GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 06e7fabd..64857280 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -49,7 +49,6 @@ def __init__( clip_param=0.2, gamma=0.998, lam=0.95, - value_loss_coef=1.0, entropy_coef=0.0, learning_rate=1e-3, max_grad_norm=1.0, @@ -80,7 +79,6 @@ def __init__( self.clip_param = clip_param self.num_learning_epochs = num_learning_epochs self.num_mini_batches = num_mini_batches - self.value_loss_coef = value_loss_coef self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -146,24 +144,65 @@ def update(self): self.mean_surrogate_loss = 0 self.update_critic() + self.update_actor() + self.storage.clear() + def update_critic(self): + self.mean_value_loss = 0 + counter = 0 generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) for ( - obs_batch, + _, critic_obs_batch, - actions_batch, + _, target_values_batch, - advantages_batch, + _, returns_batch, + _, + _, + _, + ) in generator: + value_batch = self.critic.evaluate(critic_obs_batch) + if self.use_clipped_value_loss: + value_clipped = target_values_batch + ( + value_batch - target_values_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (returns_batch - value_batch).pow(2).mean() + + # * Gradient Step + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self): + self.mean_surrogate_loss = 0 + counter = 0 + generator = self.storage.mini_batch_generator( + 
self.num_mini_batches, self.num_learning_epochs + ) + for ( + obs_batch, + _, + actions_batch, + _, + advantages_batch, + _, old_actions_log_prob_batch, old_mu_batch, old_sigma_batch, ) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) - # value_batch = self.critic.evaluate(critic_obs_batch) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -201,22 +240,7 @@ def update(self): ) surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() - # * Value function loss - # if self.use_clipped_value_loss: - # value_clipped = target_values_batch + ( - # value_batch - target_values_batch - # ).clamp(-self.clip_param, self.clip_param) - # value_losses = (value_batch - returns_batch).pow(2) - # value_losses_clipped = (value_clipped - returns_batch).pow(2) - # value_loss = torch.max(value_losses, value_losses_clipped).mean() - # else: - # value_loss = (returns_batch - value_batch).pow(2).mean() - - loss = ( - surrogate_loss - # + self.value_loss_coef * value_loss - - self.entropy_coef * entropy_batch.mean() - ) + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() # * Gradient step self.optimizer.zero_grad() @@ -226,51 +250,6 @@ def update(self): self.max_grad_norm, ) self.optimizer.step() - - # self.mean_value_loss += value_loss.item() self.mean_surrogate_loss += surrogate_loss.item() - - num_updates = self.num_learning_epochs * self.num_mini_batches - # self.mean_value_loss /= num_updates - self.mean_surrogate_loss /= num_updates - self.storage.clear() - - def update_critic(self): - self.mean_value_loss = 0 - counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - for ( - obs_batch, - critic_obs_batch, - actions_batch, - target_values_batch, - advantages_batch, - returns_batch, - old_actions_log_prob_batch, - old_mu_batch, - old_sigma_batch, - ) in generator: - value_batch = self.critic.evaluate(critic_obs_batch) - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() - - # * Gradient Step - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() counter += 1 - self.mean_value_loss /= counter - - def update_policy(self): - pass + self.mean_surrogate_loss /= counter diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 5c1cf798..2c8cda65 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,5 +1,5 @@ import torch -from learning.algorithms import * # noqa +from learning.algorithms import * # noqa: F403 from learning.modules import ActorCritic from learning.env import VecEnv from learning.utils import remove_zero_weighted_rewards @@ -19,7 +19,7 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): ).to(self.device) alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg: PPO = alg_class(actor_critic, device=self.device, **self.alg_cfg) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) self.num_steps_per_env = 
self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] From e3c258971a4db04c2f628b83ab52e945bcc7626e Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 14:17:25 -0500 Subject: [PATCH 04/69] Basic internals of dict_storage in place, fills correctly --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +- learning/algorithms/ppo2.py | 15 ++++--- learning/runners/my_runner.py | 28 +++++++++++- learning/storage/__init__.py | 1 + learning/storage/dict_storage.py | 45 +++++++++++++++++++ 5 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 learning/storage/dict_storage.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index c59f8a56..4ff6ebaa 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - + normalize_obs = False actor_obs = [ "base_ang_vel", "projected_gravity", @@ -142,6 +142,8 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 + storage_size = 2**17 # new + mini_batch_size = 2**15 # new learning_rate = 5.0e-5 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 64857280..118920e0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -153,13 +153,16 @@ def update_critic(self): generator = self.storage.mini_batch_generator( self.num_mini_batches, self.num_learning_epochs ) + + # update storage with values evaluated now + for ( _, critic_obs_batch, _, - target_values_batch, + target_values_batch, # ! eval _, - returns_batch, + returns_batch, # ! calc _, _, _, @@ -195,11 +198,11 @@ def update_actor(self): _, actions_batch, _, - advantages_batch, + advantages_batch, # calc + eval _, - old_actions_log_prob_batch, - old_mu_batch, - old_sigma_batch, + old_actions_log_prob_batch, # eval? + old_mu_batch, # eval? + old_sigma_batch, # eval? 
) in generator: self.actor.act(obs_batch) actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 74b842d4..f1f824b8 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -1,4 +1,5 @@ import torch +from tensordict import TensorDict from learning.env import VecEnv from learning.utils import Logger @@ -7,7 +8,10 @@ from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + logger = Logger() +storage = DictStorage() class MyRunner(OnPolicyRunner): @@ -34,9 +38,21 @@ def learn(self): actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - + rewards_dict self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize(transition) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -67,6 +83,16 @@ def learn(self): rewards_dict.update(PBRS.post_step(self.env, dones)) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "actor_obs": actor_obs, + "critic_obs": critic_obs, + "rewards": total_rewards, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index edd7bd55..f10df85f 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,3 +3,4 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage +from .dict_storage import DictStorage \ No newline at end of file diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py new file mode 100644 index 00000000..2b5c8e3b --- /dev/null +++ b/learning/storage/dict_storage.py @@ -0,0 +1,45 @@ +import torch +from tensordict import TensorDict + + +class DictStorage: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.storage = TensorDict({}, batch_size=(num_envs, max_length), device=device) + self.fill_count = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: + self.storage[key] = torch.zeros( + (self.num_envs, self.max_length), + device=self.device, + ) + else: + self.storage[key] = torch.zeros( + (self.num_envs, self.max_length, dummy_dict[key].shape[1]), + device=self.device, + ) + + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length: + raise AssertionError("Rollout buffer overflow") + self.storage[:, self.fill_count] = transition + self.fill_count += 1 + + def clear(self): + self.fill_count = 0 + + def mini_batch_generator(self, keys, mini_batch_size, num_epochs): + pass From 7c1166b9048e1dcd228596a6fa84a89dcee67eaf Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 20 Feb 2024 18:57:19 -0500 Subject: [PATCH 05/69] WIP: change storage to be (n_steps, n_envs, var_dim), among other things --- 
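For orientation, a minimal stand-alone sketch of the time-major (n_steps, n_envs, var_dim) layout this patch moves the storage toward; the key names mirror DictStorage, but the class, dimensions and values here are illustrative placeholders rather than code from this series:

import torch
from tensordict import TensorDict

# hypothetical dimensions, chosen only for the example
num_steps, num_envs, obs_dim, act_dim = 32, 8, 48, 12

# time-major buffer: the leading batch dimension is the rollout step
data = TensorDict(
    {
        "actor_obs": torch.zeros(num_steps, num_envs, obs_dim),
        "actions": torch.zeros(num_steps, num_envs, act_dim),
        "rewards": torch.zeros(num_steps, num_envs),
        "dones": torch.zeros(num_steps, num_envs, dtype=torch.bool),
    },
    batch_size=(num_steps, num_envs),
)

# writing one transition fills row t across all envs at once
t = 0
data[t] = TensorDict(
    {
        "actor_obs": torch.randn(num_envs, obs_dim),
        "actions": torch.randn(num_envs, act_dim),
        "rewards": torch.randn(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=(num_envs,),
)

# data[k] is "everything at step k", so return/advantage recursions can walk
# the leading dimension directly, and flatten(0, 1) yields per-sample batches
print(data["rewards"].shape)          # torch.Size([32, 8])
print(data.flatten(0, 1).batch_size)  # torch.Size([256])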
learning/algorithms/ppo2.py | 32 ----------- learning/runners/my_runner.py | 11 +++- learning/storage/dict_storage.py | 14 ++--- learning/utils/__init__.py | 37 ++----------- learning/utils/utils.py | 93 -------------------------------- 5 files changed, 21 insertions(+), 166 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 118920e0..a5ef5ac4 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -1,35 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. 
-# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - import torch import torch.nn as nn import torch.optim as optim diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index f1f824b8..4a20880d 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,6 +5,7 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards +# from learning.utils import compute_MC_returns, compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -46,6 +47,7 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -86,19 +88,26 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "actions": actions, "critic_obs": critic_obs, "rewards": total_rewards, "dones": dones, } ) - storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) self.alg.process_env_step(total_rewards, dones, timed_out) + storage.add_transitions(transition) + self.alg.compute_returns(critic_obs) + # compute_MC_returns(storage.data, self.alg.gamma) + # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic.eval) + # compute_generalized_advantages( + # storage.data, self.alg.gamma, self.alg.lam, self.alg.critic.eval + # ) logger.toc("collection") logger.tic("learning") diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 2b5c8e3b..135684d3 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -17,25 +17,25 @@ def initialize( self.num_envs = num_envs max_length = max_storage // num_envs self.max_length = max_length - self.storage = TensorDict({}, batch_size=(num_envs, max_length), device=device) + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) self.fill_count = 0 for key in dummy_dict.keys(): - if dummy_dict[key].dim() == 1: - self.storage[key] = torch.zeros( - (self.num_envs, self.max_length), + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), device=self.device, ) else: - self.storage[key] = torch.zeros( - (self.num_envs, self.max_length, dummy_dict[key].shape[1]), + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), device=self.device, ) def add_transitions(self, transition: TensorDict): if self.fill_count >= self.max_length: raise AssertionError("Rollout buffer overflow") - self.storage[:, self.fill_count] = transition + self.data[self.fill_count] = transition self.fill_count += 1 def clear(self): diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index a39a3daf..8559714e 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,40 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. -# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin from .utils import ( - split_and_pad_trajectories, - unpad_trajectories, remove_zero_weighted_rewards, set_discount_from_horizon, ) +from .dict_utils import ( + compute_generalized_advantages, + compute_MC_returns +) from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/utils.py b/learning/utils/utils.py index b1a20bcf..7a54c65a 100644 --- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -1,96 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. 
-# -# Copyright (c) 2021 ETH Zurich, Nikita Rudin - -import torch - - -def split_and_pad_trajectories(tensor, dones): - """ - Splits trajectories at done indices. Then concatenates them and pads - with zeros up to the length og the longest trajectory. - Returns masks corresponding to valid parts of the trajectories - Example: - Input: [ [a1, a2, a3, a4 | a5, a6], - [b1, b2 | b3, b4, b5 | b6] - ] - - Output:[ [a1, a2, a3, a4], | [ [True, True, True, True], - [a5, a6, 0, 0], | [True, True, False, False], - [b1, b2, 0, 0], | [True, True, False, False], - [b3, b4, b5, 0], | [True, True, True, False], - [b6, 0, 0, 0] | [True, False, False, False], - ] | ] - - Assumes that the inputy has the following dimension order: - [time, number of envs, aditional dimensions] - """ - dones = dones.clone() - dones[-1] = 1 - # * Permute the buffers to have order - # * (num_envs, num_transitions_per_env, ...), for correct reshaping - flat_dones = dones.transpose(1, 0).reshape(-1, 1) - - # * Get length of trajectory by counting the number of - # * successive not done elements - done_indices = torch.cat( - ( - flat_dones.new_tensor([-1], dtype=torch.int64), - flat_dones.nonzero()[:, 0], - ) - ) - trajectory_lengths = done_indices[1:] - done_indices[:-1] - trajectory_lengths_list = trajectory_lengths.tolist() - # * Extract the individual trajectories - trajectories = torch.split( - tensor.transpose(1, 0).flatten(0, 1), trajectory_lengths_list - ) - padded_trajectories = torch.nn.utils.rnn.pad_sequence(trajectories) - - trajectory_masks = trajectory_lengths > torch.arange( - 0, tensor.shape[0], device=tensor.device - ).unsqueeze(1) - return padded_trajectories, trajectory_masks - - -def unpad_trajectories(trajectories, masks): - """Does the inverse operation of split_and_pad_trajectories()""" - # * Need to transpose before and after the masking to have proper reshaping - return ( - trajectories.transpose(1, 0)[masks.transpose(1, 0)] - .view(-1, trajectories.shape[0], trajectories.shape[-1]) - .transpose(1, 0) - ) - - def remove_zero_weighted_rewards(reward_weights): for name in list(reward_weights.keys()): if reward_weights[name] == 0: From fa05a31b1efbbd341f1aa8e142485c334a7b601d Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 09:39:33 -0500 Subject: [PATCH 06/69] implemented compute advantages (returns), validated against existing. Same up to 1e-5 on individual values. Because of differencing, this can end up in much larger differences (1e0) occasionally, after normalization. 
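A sketch of the kind of comparison this refers to; adv_new and adv_old are placeholders standing in for the advantages produced by the two code paths on the same rollout, not values from this series:

import torch

# placeholder tensors: nearly flat advantages that agree to ~1e-5
adv_new = 1e-4 * torch.randn(32, 4096)
adv_old = adv_new + 1e-5 * torch.randn(32, 4096)

# element-wise, the two implementations match closely
print(torch.allclose(adv_new, adv_old, atol=1e-4))  # True

def normalize(a):
    return (a - a.mean()) / (a.std() + 1e-8)

# after batch normalization the same absolute error is divided by the
# advantage std, so where that spread is small the per-element discrepancy
# can grow by orders of magnitude
print((normalize(adv_new) - normalize(adv_old)).abs().max())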
--- learning/runners/my_runner.py | 29 +++++++++++++----- learning/storage/dict_storage.py | 2 ++ learning/utils/dict_utils.py | 52 ++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 learning/utils/dict_utils.py diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 4a20880d..08721b5e 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,7 +5,7 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards -# from learning.utils import compute_MC_returns, compute_generalized_advantages +from learning.utils import compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -53,7 +53,7 @@ def learn(self): "dones": self.get_timed_out(), } ) - storage.initialize(transition) + storage.initialize(transition, device=self.device) logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): @@ -69,6 +69,14 @@ def learn(self): self.policy_cfg["disable_actions"], ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + PBRS.pre_step(self.env) self.env.step() @@ -87,9 +95,6 @@ def learn(self): transition.update( { - "actor_obs": actor_obs, - "actions": actions, - "critic_obs": critic_obs, "rewards": total_rewards, "dones": dones, } @@ -104,10 +109,20 @@ def learn(self): self.alg.compute_returns(critic_obs) # compute_MC_returns(storage.data, self.alg.gamma) - # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic.eval) + # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic) # compute_generalized_advantages( - # storage.data, self.alg.gamma, self.alg.lam, self.alg.critic.eval + # storage.data, + # self.alg.gamma, + # self.alg.lam, + # self.alg.critic, # ) + compute_generalized_advantages( + storage.data, + self.alg.gamma, + self.alg.lam, + self.alg.critic, + self.alg.critic.evaluate(critic_obs), + ) logger.toc("collection") logger.tic("learning") diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 135684d3..3155ca28 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -24,11 +24,13 @@ def initialize( if dummy_dict[key].dim() == 1: # if scalar self.data[key] = torch.zeros( (max_length, num_envs), + dtype=dummy_dict[key].dtype, device=self.device, ) else: self.data[key] = torch.zeros( (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, device=self.device, ) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py new file mode 100644 index 00000000..c1b3cb4f --- /dev/null +++ b/learning/utils/dict_utils.py @@ -0,0 +1,52 @@ +import torch +from tensordict import TensorDict + + +def compute_MC_returns(data: TensorDict, gamma, critic=None): + if critic is None: + last_values = torch.zeros_like(data["rewards"][0]) + else: + last_values = critic.evaluate(data["critic_obs"][-1]) + + data.update({"returns": torch.zeros_like(data["rewards"])}) + data["returns"][-1] = last_values * ~data["dones"][-1] + for k in reversed(range(data["rewards"].shape[0] - 1)): + not_done = ~data["dones"][k] + data["returns"][k] = ( + data["rewards"][k] + gamma * data["returns"][k + 1] * not_done + ) + return + + +def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + if "values" not in data.keys(): + data.update({"values": critic.evaluate(data["critic_obs"])}) + + if "advantages" not in 
data.keys(): + data.update({"advantages": torch.zeros_like(data["values"])}) + else: + data["advantages"].zero_() + + if last_values is not None: + # todo check this + # since we don't have observations for the last step, need last value plugged in + data["advantages"][-1] = ( + data["rewards"][-1] + + gamma * last_values * ~data["dones"][-1] + - data["values"][-1] + ) + + for k in reversed(range(data["values"].shape[0] - 1)): + not_done = ~data["dones"][k] + td_error = ( + data["rewards"][k] + + gamma * data["values"][k + 1] * not_done + - data["values"][k] + ) + data["advantages"][k] = ( + td_error + gamma * lam * not_done * data["advantages"][k + 1] + ) + # data["returns"] = data["advantages"] + data["values"] + data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( + data["advantages"].std() + 1e-8 + ) From 5c5d38b2a5e79f26c4776a6806b5d4d8abb99ff5 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 13:52:55 -0500 Subject: [PATCH 07/69] critic update runs, but is very slow. Not clear why --- learning/algorithms/ppo2.py | 29 +++++++++++++++++++++++++++++ learning/runners/my_runner.py | 3 +++ learning/storage/dict_storage.py | 4 +--- learning/utils/__init__.py | 5 +---- learning/utils/dict_utils.py | 17 +++++++++++++++++ 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index a5ef5ac4..1a6e475f 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -4,6 +4,7 @@ # from learning.modules import ActorCritic from learning.storage import RolloutStorage +from learning.utils import create_uniform_generator, compute_MC_returns class PPO2: @@ -115,6 +116,34 @@ def update(self): self.update_actor() self.storage.clear() + def update_critic2(self, data): + self.mean_value_loss = 0 + counter = 0 + compute_MC_returns(data, self.gamma, self.critic) + generator = create_uniform_generator(data, 500, self.num_learning_epochs) + # with torch.inference_mode(): + # target_values = self.critic.evaluate(data["critic_obs"]).detach() + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + if self.use_clipped_value_loss: + target_value_batch = batch["values"] + value_clipped = target_value_batch + ( + value_batch - target_value_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (value_batch - target_value_batch).pow(2) + value_losses_clipped = (value_clipped - batch["returns"]).pow(2) + value_loss = torch.max(value_losses, value_losses_clipped).mean() + else: + value_loss = (batch["returns"] - value_batch).pow(2).mean() + + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + def update_critic(self): self.mean_value_loss = 0 counter = 0 diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 08721b5e..7277dda6 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -105,6 +105,7 @@ def learn(self): logger.finish_step(dones) self.alg.process_env_step(total_rewards, dones, timed_out) + # doesn't need detach becuase we have no_inference storage.add_transitions(transition) self.alg.compute_returns(critic_obs) @@ -126,6 +127,7 @@ def learn(self): logger.toc("collection") logger.tic("learning") + self.alg.update_critic2(storage.data) self.alg.update() logger.toc("learning") logger.log_category() @@ 
-137,6 +139,7 @@ def learn(self): if self.it % self.save_interval == 0: self.save() + storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 3155ca28..8354b179 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -34,6 +34,7 @@ def initialize( device=self.device, ) + @torch.inference_mode def add_transitions(self, transition: TensorDict): if self.fill_count >= self.max_length: raise AssertionError("Rollout buffer overflow") @@ -42,6 +43,3 @@ def add_transitions(self, transition: TensorDict): def clear(self): self.fill_count = 0 - - def mini_batch_generator(self, keys, mini_batch_size, num_epochs): - pass diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 8559714e..15c674d6 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -3,9 +3,6 @@ remove_zero_weighted_rewards, set_discount_from_horizon, ) -from .dict_utils import ( - compute_generalized_advantages, - compute_MC_returns -) +from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index c1b3cb4f..1480c48e 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -2,6 +2,7 @@ from tensordict import TensorDict +@torch.no_grad def compute_MC_returns(data: TensorDict, gamma, critic=None): if critic is None: last_values = torch.zeros_like(data["rewards"][0]) @@ -18,6 +19,7 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): return +@torch.no_grad def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): if "values" not in data.keys(): data.update({"values": critic.evaluate(data["critic_obs"])}) @@ -50,3 +52,18 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( data["advantages"].std() + 1e-8 ) + + +# todo change num_epochs to num_batches +@torch.no_grad +def create_uniform_generator(data, batch_size, num_epochs, keys=None): + n, m = data.shape + total_data = n * m + num_batches_per_epoch = total_data // batch_size + for epoch in range(num_epochs): + indices = torch.randperm(total_data, device=data.device) + for i in range(num_batches_per_epoch): + batched_data = data.flatten(0, 1)[ + indices[i * batch_size : (i + 1) * batch_size] + ] + yield batched_data From afa0be251745f719f7114792a8f5140674dd8369 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 21 Feb 2024 20:13:14 -0500 Subject: [PATCH 08/69] Learning critic with dict storage now working. Left without boot-strapping on last step, seems like it makes no difference. Boot-strapping on time-outs made a big difference though. 
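The time-out handling described here has the same shape as the td_error computed in dict_utils.py below; a small self-contained illustration (the function and tensor names are only for this example):

import torch

def td_error(rew_k, val_k, val_k1, done_k, timed_out_k, gamma):
    # failure termination: the future term is dropped entirely
    # time-out: keep a bootstrap term, gamma * V(s_k), as in the patched
    # dict_utils, since the post-reset observation belongs to the next episode
    not_done = ~done_k
    bootstrap = gamma * (val_k1 * not_done + val_k * timed_out_k)
    return rew_k + bootstrap - val_k

rew = torch.tensor([1.0])
v_k, v_k1 = torch.tensor([10.0]), torch.tensor([9.0])
done = torch.tensor([True])

# bootstrapping on a time-out keeps most of the state's value in the target
print(td_error(rew, v_k, v_k1, done, torch.tensor([True]), gamma=0.99))   # ~0.9
# treating the same step as a true failure wipes it out, a very different signal
print(td_error(rew, v_k, v_k1, done, torch.tensor([False]), gamma=0.99))  # -9.0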
--- learning/algorithms/ppo2.py | 29 +++++++++++++++++++++-------- learning/runners/my_runner.py | 17 +++++++++-------- learning/utils/dict_utils.py | 20 ++++++++++++-------- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 1a6e475f..d4ecca81 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -4,7 +4,10 @@ # from learning.modules import ActorCritic from learning.storage import RolloutStorage -from learning.utils import create_uniform_generator, compute_MC_returns +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) class PPO2: @@ -109,18 +112,28 @@ def compute_returns(self, last_critic_obs): self.storage.compute_returns(last_values, self.gamma, self.lam) def update(self): - self.mean_value_loss = 0 - self.mean_surrogate_loss = 0 - - self.update_critic() + # self.update_critic() self.update_actor() self.storage.clear() - def update_critic2(self, data): + def update_critic2(self, data, last_obs=None): self.mean_value_loss = 0 counter = 0 - compute_MC_returns(data, self.gamma, self.critic) - generator = create_uniform_generator(data, 500, self.num_learning_epochs) + + if last_obs is not None: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + else: + last_values = None + # compute_MC_returns(data, self.gamma, self.critic) + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + # ! temp hack + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) # with torch.inference_mode(): # target_values = self.critic.evaluate(data["critic_obs"]).detach() for batch in generator: diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 7277dda6..9b3f1939 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -5,7 +5,6 @@ from learning.utils import Logger from learning.utils import PotentialBasedRewardShaping from learning.utils import remove_zero_weighted_rewards -from learning.utils import compute_generalized_advantages from .on_policy_runner import OnPolicyRunner @@ -96,6 +95,7 @@ def learn(self): transition.update( { "rewards": total_rewards, + "timed_out": timed_out, "dones": dones, } ) @@ -117,16 +117,17 @@ def learn(self): # self.alg.lam, # self.alg.critic, # ) - compute_generalized_advantages( - storage.data, - self.alg.gamma, - self.alg.lam, - self.alg.critic, - self.alg.critic.evaluate(critic_obs), - ) + # compute_generalized_advantages( + # storage.data, + # self.alg.gamma, + # self.alg.lam, + # self.alg.critic, + # self.alg.critic.evaluate(critic_obs), + # ) logger.toc("collection") logger.tic("learning") + # self.alg.update_critic2(storage.data, last_obs=critic_obs) self.alg.update_critic2(storage.data) self.alg.update() logger.toc("learning") diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 1480c48e..2d19e072 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -16,25 +16,26 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): data["returns"][k] = ( data["rewards"][k] + gamma * data["returns"][k + 1] * not_done ) + data["returns"] = (data["returns"] - data["returns"].mean()) / ( + data["returns"].std() + 1e-8 + ) return @torch.no_grad def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - if "values" not in 
data.keys(): - data.update({"values": critic.evaluate(data["critic_obs"])}) + data.update({"values": critic.evaluate(data["critic_obs"])}) - if "advantages" not in data.keys(): - data.update({"advantages": torch.zeros_like(data["values"])}) - else: - data["advantages"].zero_() + data.update({"advantages": torch.zeros_like(data["values"])}) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in + not_done = ~data["dones"][-1] data["advantages"][-1] = ( data["rewards"][-1] - + gamma * last_values * ~data["dones"][-1] + + gamma * data["values"][-1] * data["timed_out"][-1] + + gamma * last_values * not_done - data["values"][-1] ) @@ -42,13 +43,16 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): not_done = ~data["dones"][k] td_error = ( data["rewards"][k] + + gamma * data["values"][k] * data["timed_out"][k] + gamma * data["values"][k + 1] * not_done - data["values"][k] ) data["advantages"][k] = ( td_error + gamma * lam * not_done * data["advantages"][k + 1] ) - # data["returns"] = data["advantages"] + data["values"] + + data["returns"] = data["advantages"] + data["values"] + data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( data["advantages"].std() + 1e-8 ) From 12661f0eb85e12b22587b145f53f2f2f625e6b79 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 25 Feb 2024 15:29:20 -0500 Subject: [PATCH 09/69] add tensordict to reqs --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d08d918a..10e4b8c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ mss ruff pre-commit onnx -onnxruntime \ No newline at end of file +onnxruntime +tensordict \ No newline at end of file From 679a729b330c753e0b8083fb093dd1bd87a485a0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:10:36 -0500 Subject: [PATCH 10/69] WIP: works now, but not with normalization turned on --- gym/envs/base/fixed_robot_config.py | 2 +- gym/envs/base/legged_robot_config.py | 2 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 +- .../mini_cheetah/mini_cheetah_ref_config.py | 7 - learning/algorithms/ppo.py | 5 + learning/algorithms/ppo2.py | 168 +++++------------- learning/modules/critic.py | 3 +- learning/modules/utils/normalize.py | 57 ++---- learning/runners/BaseRunner.py | 3 - learning/runners/__init__.py | 3 +- learning/runners/my_runner.py | 51 +----- learning/runners/on_policy_runner.py | 54 ++++-- learning/storage/dict_storage.py | 3 + 13 files changed, 113 insertions(+), 248 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index a59cc9eb..4d039f74 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -182,7 +182,7 @@ class algorithm: class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..2955f5a5 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -300,7 +300,7 @@ class algorithm: class runner: policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1500 save_interval = 50 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py 
b/gym/envs/mini_cheetah/mini_cheetah_config.py index 502dc571..689fcd64 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -124,6 +124,7 @@ class scaling(LeggedRobotCfg.scaling): class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OnPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] @@ -204,5 +205,5 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" max_iterations = 500 - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4ff6ebaa..f7699f32 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -123,13 +123,6 @@ class weights: swing_grf = 1.5 stance_grf = 1.5 - class pbrs_weights: - reference_traj = 0.0 - swing_grf = 0.0 - stance_grf = 0.0 - min_base_height = 0.0 - orientation = 0.0 - class termination_weight: termination = 0.15 diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index f492ef0e..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -126,6 +126,11 @@ def act(self, obs, critic_obs): return self.transition.actions def process_env_step(self, rewards, dones, timed_out=None): + if self.storage is None: + raise AttributeError( + "This version of PPO is deprecated and only works with OldPolicyRunner." + "Use PPO2 instead." + ) self.transition.rewards = rewards.clone() self.transition.dones = dones # * Bootstrapping on time outs diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index d4ecca81..ae5226d2 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -2,8 +2,6 @@ import torch.nn as nn import torch.optim as optim -# from learning.modules import ActorCritic -from learning.storage import RolloutStorage from learning.utils import ( create_uniform_generator, compute_generalized_advantages, @@ -45,7 +43,6 @@ def __init__( # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) - self.transition = RolloutStorage.Transition() # * PPO parameters self.clip_param = clip_param @@ -57,23 +54,6 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def init_storage( - self, - num_envs, - num_transitions_per_env, - actor_obs_shape, - critic_obs_shape, - action_shape, - ): - self.storage = RolloutStorage( - num_envs, - num_transitions_per_env, - actor_obs_shape, - critic_obs_shape, - action_shape, - self.device, - ) - def test_mode(self): self.actor.test() self.critic.test() @@ -83,61 +63,33 @@ def train_mode(self): self.critic.train() def act(self, obs, critic_obs): - # * Compute the actions and values - self.transition.actions = self.actor.act(obs).detach() - self.transition.values = self.critic.evaluate(critic_obs).detach() - self.transition.actions_log_prob = self.actor.get_actions_log_prob( - self.transition.actions - ).detach() - self.transition.action_mean = self.actor.action_mean.detach() - self.transition.action_sigma = self.actor.action_std.detach() - # * need to record obs and critic_obs before env.step() - self.transition.observations = obs - self.transition.critic_observations = 
critic_obs - return self.transition.actions - - def process_env_step(self, rewards, dones, timed_out=None): - self.transition.rewards = rewards.clone() - self.transition.dones = dones - # * Bootstrapping on time outs - if timed_out is not None: - self.transition.rewards += self.gamma * self.transition.values * timed_out - - # * Record the transition - self.storage.add_transitions(self.transition) - self.transition.clear() - - def compute_returns(self, last_critic_obs): - last_values = self.critic.evaluate(last_critic_obs).detach() - self.storage.compute_returns(last_values, self.gamma, self.lam) - - def update(self): - # self.update_critic() - self.update_actor() - self.storage.clear() - - def update_critic2(self, data, last_obs=None): - self.mean_value_loss = 0 - counter = 0 + return self.actor.act(obs).detach() - if last_obs is not None: + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: with torch.no_grad(): last_values = self.critic.evaluate(last_obs).detach() - else: - last_values = None - # compute_MC_returns(data, self.gamma, self.critic) compute_generalized_advantages( data, self.gamma, self.lam, self.critic, last_values ) - # ! temp hack + + self.update_critic(data) + self.update_actor(data) + + def update_critic(self, data, last_obs=None): + self.mean_value_loss = 0 + counter = 0 + n, m = data.shape total_data = n * m batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) - # with torch.inference_mode(): - # target_values = self.critic.evaluate(data["critic_obs"]).detach() for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) + with torch.inference_mode(): + value_batch = self.critic.evaluate(batch["critic_obs"]) + if self.use_clipped_value_loss: target_value_batch = batch["values"] value_clipped = target_value_batch + ( @@ -157,65 +109,27 @@ def update_critic2(self, data, last_obs=None): counter += 1 self.mean_value_loss /= counter - def update_critic(self): - self.mean_value_loss = 0 + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - - # update storage with values evaluated now - - for ( - _, - critic_obs_batch, - _, - target_values_batch, # ! eval - _, - returns_batch, # ! 
calc - _, - _, - _, - ) in generator: - value_batch = self.critic.evaluate(critic_obs_batch) - if self.use_clipped_value_loss: - value_clipped = target_values_batch + ( - value_batch - target_values_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (returns_batch - value_batch).pow(2).mean() - # * Gradient Step - self.critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) - self.critic_optimizer.step() - self.mean_value_loss += value_loss.item() - counter += 1 - self.mean_value_loss /= counter + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() - def update_actor(self): - self.mean_surrogate_loss = 0 - counter = 0 - generator = self.storage.mini_batch_generator( - self.num_mini_batches, self.num_learning_epochs - ) - for ( - obs_batch, - _, - actions_batch, - _, - advantages_batch, # calc + eval - _, - old_actions_log_prob_batch, # eval? - old_mu_batch, # eval? - old_sigma_batch, # eval? - ) in generator: - self.actor.act(obs_batch) - actions_log_prob_batch = self.actor.get_actions_log_prob(actions_batch) + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std entropy_batch = self.actor.entropy @@ -224,10 +138,10 @@ def update_actor(self): if self.desired_kl is not None and self.schedule == "adaptive": with torch.inference_mode(): kl = torch.sum( - torch.log(sigma_batch / old_sigma_batch + 1.0e-5) + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + ( - torch.square(old_sigma_batch) - + torch.square(old_mu_batch - mu_batch) + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) ) / (2.0 * torch.square(sigma_batch)) - 0.5, @@ -241,14 +155,16 @@ def update_actor(self): self.learning_rate = min(1e-2, self.learning_rate * 1.5) for param_group in self.optimizer.param_groups: + # ! 
check this param_group["lr"] = self.learning_rate # * Surrogate loss ratio = torch.exp( - actions_log_prob_batch - torch.squeeze(old_actions_log_prob_batch) + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) ) - surrogate = -torch.squeeze(advantages_batch) * ratio - surrogate_clipped = -torch.squeeze(advantages_batch) * torch.clamp( + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( ratio, 1.0 - self.clip_param, 1.0 + self.clip_param ) surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..9b11f3ee 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -31,5 +31,6 @@ def evaluate(self, critic_observations): return self.NN(critic_observations).squeeze() def normalize(self, observation): - with torch.no_grad(): + # with torch.no_grad(): + with torch.inference_mode(): return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 5623b80c..a8e52fd2 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -2,22 +2,6 @@ import torch.nn as nn -def get_mean_var_with_masks(values, masks): - """ - Applies a mask to the input values and calculates - the mean and variance over the valid entries, as specified - by the mask. - """ - sum_mask = masks.sum() - masked_vals = values * masks - values_mean = masked_vals.sum() / sum_mask - min_sqr = (((masked_vals) ** 2) / sum_mask).sum() - ( - (masked_vals / sum_mask).sum() - ) ** 2 - values_var = min_sqr * sum_mask / (sum_mask - 1) - return values_mean, values_var - - class RunningMeanStd(nn.Module): """ Keeps a running mean to normalize tensor of choice. @@ -44,11 +28,6 @@ def _update_mean_var_from_moments( batch_var, batch_count, ): - """ - Implements parallel algorithm for combining arbitrary sets A and B - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - #Parallel_algorithm - """ tot_count = running_count + batch_count delta = batch_mean - running_mean @@ -64,28 +43,22 @@ def _update_mean_var_from_moments( new_var = M2 / (tot_count - 1) return new_mean, new_var, tot_count - def forward(self, input, mask=None): - """ - Returns the normalized version of the input. 
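# The adaptive learning-rate schedule used in update_actor above, pulled out
# as a standalone sketch: the KL divergence between the old and new diagonal
# Gaussian policies is compared against desired_kl, and the Adam step size is
# scaled up or down. Constants mirror the values in this file; the function
# names are illustrative, not the repo's API.
import torch


def kl_diag_gaussian(mu_old, sigma_old, mu_new, sigma_new):
    # KL(old || new), summed over action dimensions
    return torch.sum(
        torch.log(sigma_new / sigma_old + 1.0e-5)
        + (sigma_old.pow(2) + (mu_old - mu_new).pow(2)) / (2.0 * sigma_new.pow(2))
        - 0.5,
        dim=-1,
    )


def adapt_learning_rate(optimizer, lr, kl_mean, desired_kl=0.01):
    # kl_mean is the batch-mean KL (a float or 0-dim tensor)
    if kl_mean > desired_kl * 2.0:
        lr = max(1.0e-5, lr / 1.5)   # overshot the trust region: shrink the step
    elif 0.0 < kl_mean < desired_kl / 2.0:
        lr = min(1.0e-2, lr * 1.5)   # overly conservative update: grow the step
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    return lr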
- """ + def forward(self, input): if self.training: - if mask is not None: - mean, var = get_mean_var_with_masks(input, mask) - else: - mean = input.mean(self.axis) - var = input.var(self.axis) - ( - self.running_mean, - self.running_var, - self.count, - ) = self._update_mean_var_from_moments( - self.running_mean, - self.running_var, - self.count, - mean, - var, - input.size()[0], - ) + mean = input.mean(self.axis) + var = input.var(self.axis) + ( + self.running_mean, + self.running_var, + self.count, + ) = self._update_mean_var_from_moments( + self.running_mean, + self.running_var, + self.count, + mean, + var, + input.size()[0], + ) current_mean = self.running_mean current_var = self.running_var diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 2c8cda65..53e88835 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -26,9 +26,6 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 - - # * init storage and model - self.init_storage() self.log_dir = train_cfg["log_dir"] def parse_train_cfg(self, train_cfg): diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index a24a0c9c..b0f49217 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -31,4 +31,5 @@ # Copyright (c) 2021 ETH Zurich, Nikita Rudin from .on_policy_runner import OnPolicyRunner -from .my_runner import MyRunner \ No newline at end of file +from .my_runner import MyRunner +from .old_policy_runner import OldPolicyRunner \ No newline at end of file diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 9b3f1939..e25ca265 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -3,11 +3,8 @@ from learning.env import VecEnv from learning.utils import Logger -from learning.utils import PotentialBasedRewardShaping -from learning.utils import remove_zero_weighted_rewards from .on_policy_runner import OnPolicyRunner - from learning.storage import DictStorage logger = Logger() @@ -26,11 +23,6 @@ def __init__(self, env: VecEnv, train_cfg, device="cpu"): def learn(self): self.set_up_logger() - remove_zero_weighted_rewards(self.policy_cfg["reward"]["pbrs_weights"]) - PBRS = PotentialBasedRewardShaping( - self.policy_cfg["reward"]["pbrs_weights"], self.device - ) - logger.register_rewards(PBRS.get_reward_keys()) rewards_dict = {} @@ -76,7 +68,6 @@ def learn(self): } ) - PBRS.pre_step(self.env) self.env.step() actor_obs = self.get_noisy_obs( @@ -89,7 +80,6 @@ def learn(self): dones = timed_out | terminated self.update_rewards(rewards_dict, terminated) - rewards_dict.update(PBRS.post_step(self.env, dones)) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) transition.update( @@ -99,37 +89,16 @@ def learn(self): "dones": dones, } ) + storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - # doesn't need detach becuase we have no_inference - storage.add_transitions(transition) - - self.alg.compute_returns(critic_obs) - # compute_MC_returns(storage.data, self.alg.gamma) - # compute_MC_returns(storage.data, self.alg.gamma, self.alg.critic) - # compute_generalized_advantages( - # storage.data, - # self.alg.gamma, - # self.alg.lam, - # self.alg.critic, - # ) - # compute_generalized_advantages( - # storage.data, - # 
self.alg.gamma, - # self.alg.lam, - # self.alg.critic, - # self.alg.critic.evaluate(critic_obs), - # ) logger.toc("collection") logger.tic("learning") - # self.alg.update_critic2(storage.data, last_obs=critic_obs) - self.alg.update_critic2(storage.data) - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_category() @@ -143,20 +112,6 @@ def learn(self): storage.clear() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated - ) - ) - rewards_dict.update( - self.get_rewards( - self.policy_cfg["reward"]["weights"], - modifier=self.env.dt, - mask=~terminated, - ) - ) - def set_up_logger(self): logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) logger.register_rewards( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9820d749..4cef9ec4 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,12 +1,15 @@ import os import torch +from tensordict import TensorDict from learning.env import VecEnv from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.storage import DictStorage logger = Logger() +storage = DictStorage() class OnPolicyRunner(BaseRunner): @@ -28,9 +31,22 @@ def learn(self): actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - + rewards_dict self.save() + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize(transition, device=self.device) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -45,6 +61,14 @@ def learn(self): self.policy_cfg["disable_actions"], ) + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + self.env.step() actor_obs = self.get_noisy_obs( @@ -59,16 +83,23 @@ def learn(self): self.update_rewards(rewards_dict, terminated) total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - - self.alg.process_env_step(total_rewards, dones, timed_out) - self.alg.compute_returns(critic_obs) logger.toc("collection") logger.tic("learning") - self.alg.update() + self.alg.update(storage.data) + storage.clear() logger.toc("learning") logger.log_category() @@ -79,6 +110,7 @@ def learn(self): if self.it % self.save_interval == 0: self.save() + storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): @@ -139,15 +171,3 @@ def get_inference_actions(self): def export(self, path): self.alg.actor_critic.export_policy(path) - - def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - self.alg.init_storage( - 
self.env.num_envs, - self.num_steps_per_env, - actor_obs_shape=[num_actor_obs], - critic_obs_shape=[num_critic_obs], - action_shape=[num_actions], - ) diff --git a/learning/storage/dict_storage.py b/learning/storage/dict_storage.py index 8354b179..2191093c 100644 --- a/learning/storage/dict_storage.py +++ b/learning/storage/dict_storage.py @@ -43,3 +43,6 @@ def add_transitions(self, transition: TensorDict): def clear(self): self.fill_count = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() From 6dc86fd23869cbce2141b774f4070639759339f7 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:18:30 -0500 Subject: [PATCH 11/69] fix normalization, make it work for (n, m, var_dim) shaped data --- learning/algorithms/ppo2.py | 4 +--- learning/modules/critic.py | 3 +-- learning/modules/utils/normalize.py | 7 +++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index ae5226d2..69d6dcd2 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -87,9 +87,7 @@ def update_critic(self, data, last_obs=None): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: - with torch.inference_mode(): - value_batch = self.critic.evaluate(batch["critic_obs"]) - + value_batch = self.critic.evaluate(batch["critic_obs"]) if self.use_clipped_value_loss: target_value_batch = batch["values"] value_clipped = target_value_batch + ( diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 9b11f3ee..47e0decf 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -31,6 +31,5 @@ def evaluate(self, critic_observations): return self.NN(critic_observations).squeeze() def normalize(self, observation): - # with torch.no_grad(): - with torch.inference_mode(): + with torch.no_grad(): return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index a8e52fd2..bbea22da 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -7,10 +7,9 @@ class RunningMeanStd(nn.Module): Keeps a running mean to normalize tensor of choice. 
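# The shape fix this commit describes, in isolation: batch statistics are
# reduced over every leading dimension, so the same normalizer accepts a
# per-step [num_envs, obs_dim] batch and a whole [T, num_envs, obs_dim]
# rollout. A standalone sketch with illustrative shapes; the count returned
# here is the number of samples actually folded into the running statistics.
import torch


def batch_stats(x):
    reduce_dims = tuple(range(x.dim() - 1))   # everything except the feature dim
    mean = x.mean(reduce_dims)
    var = x.var(reduce_dims)
    count = x.numel() // x.shape[-1]          # number of reduced samples
    return mean, var, count


per_step = torch.randn(64, 12)                # [num_envs, obs_dim]
rollout = torch.randn(32, 64, 12)             # [T, num_envs, obs_dim]
assert batch_stats(per_step)[0].shape == batch_stats(rollout)[0].shape == (12,)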
""" - def __init__(self, num_items, axis=0, epsilon=1e-05): + def __init__(self, num_items, epsilon=1e-05): super(RunningMeanStd, self).__init__() self.num_items = num_items - self.axis = axis self.epsilon = epsilon self.register_buffer( @@ -45,8 +44,8 @@ def _update_mean_var_from_moments( def forward(self, input): if self.training: - mean = input.mean(self.axis) - var = input.var(self.axis) + mean = input.mean(tuple(range(input.dim() - 1))) + var = input.var(tuple(range(input.dim() - 1))) ( self.running_mean, self.running_var, From 9fc831e6546ddc5c9f9c2b409ca0066ce4446d68 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 15:41:23 -0500 Subject: [PATCH 12/69] some cleanup and fixes on normalization --- .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/modules/actor.py | 14 +++------- learning/modules/critic.py | 7 ++--- learning/modules/utils/normalize.py | 28 ++++++++----------- 4 files changed, 19 insertions(+), 32 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index f7699f32..7911206a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = False + normalize_obs = True actor_obs = [ "base_ang_vel", "projected_gravity", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..7f40d502 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -51,9 +51,7 @@ def entropy(self): return self.distribution.entropy().sum(dim=-1) def update_distribution(self, observations): - if self._normalize_obs: - observations = self.normalize(observations) - mean = self.NN(observations) + mean = self.act_inference(observations) self.distribution = Normal(mean, mean * 0.0 + self.std) def act(self, observations): @@ -65,16 +63,12 @@ def get_actions_log_prob(self, actions): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) - actions_mean = self.NN(observations) - return actions_mean + with torch.no_grad(): + observations = self.obs_rms(observations) + return self.NN(observations) def forward(self, observations): return self.act_inference(observations) - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) - def export(self, path): export_network(self, "policy", path, self.num_obs) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..7ebe76c8 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -27,9 +27,6 @@ def __init__( def evaluate(self, critic_observations): if self._normalize_obs: - critic_observations = self.normalize(critic_observations) + with torch.no_grad(): + critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - - def normalize(self, observation): - with torch.no_grad(): - return self.obs_rms(observation) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index bbea22da..d00b69b5 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -3,10 +3,6 @@ class RunningMeanStd(nn.Module): - """ - Keeps a running mean to normalize tensor of choice. 
- """ - def __init__(self, num_items, epsilon=1e-05): super(RunningMeanStd, self).__init__() self.num_items = num_items @@ -46,18 +42,18 @@ def forward(self, input): if self.training: mean = input.mean(tuple(range(input.dim() - 1))) var = input.var(tuple(range(input.dim() - 1))) - ( - self.running_mean, - self.running_var, - self.count, - ) = self._update_mean_var_from_moments( - self.running_mean, - self.running_var, - self.count, - mean, - var, - input.size()[0], - ) + ( + self.running_mean, + self.running_var, + self.count, + ) = self._update_mean_var_from_moments( + self.running_mean, + self.running_var, + self.count, + mean, + var, + input.size()[0], + ) current_mean = self.running_mean current_var = self.running_var From c4912132145c42112e183697b01e7907e2c75d8a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 26 Feb 2024 16:06:38 -0500 Subject: [PATCH 13/69] Pin pytest below 8.0 (tested with 7.4.4) Pytest v8 causes import errors: ``` E _pytest.nodes.Collector.CollectError: ImportError while importing test module '/home/heim/Repos/QGym/gym/utils/__init__.py'. E Hint: make sure your test modules/packages have valid Python names. E Traceback: E ../../.pyenv/versions/3.8.18/lib/python3.8/importlib/__init__.py:127: in import_module E return _bootstrap._gcd_import(name[level:], package, level) E gym/utils/__init__.py:31: in E from .helpers import ( E gym/utils/helpers.py:36: in E from isaacgym import gymapi E ../isaacgym/python/isaacgym/__init__.py:5: in E from isaacgym import gymapi E ../isaacgym/python/isaacgym/gymapi.py:21: in E from . import gymdeps E ../isaacgym/python/isaacgym/gymdeps.py:63: in E _import_deps() E ../isaacgym/python/isaacgym/gymdeps.py:21: in _import_deps E raise ImportError("PyTorch was imported before isaacgym modules. Please import torch after isaacgym modules.") E E ImportError: PyTorch was imported before isaacgym modules. Please import torch after isaacgym modules. ../../.local/share/virtualenvs/QGym-vLS1XcJm/lib/python3.8/site-packages/_pytest/python.py:563: CollectError ==================================================== short test summary info ==================================================== ERROR gym/envs/mit_humanoid/test_coupling.py::test_coupling - _pytest.nodes.Collector.CollectError: ImportError while importing test module '/home/heim/Repos/QGym/gym/envs/__init__.py'. ERROR gym/utils/math/test_math.py::test_wrap_to_pi - _pytest.nodes.Collector.CollectError: ImportError while importing test module '/home/heim/Repos/QGym/gym/utils/__init__.py'. ERROR gym/utils/math/test_math.py::test_torch_rand_sqrt_float - _pytest.nodes.Collector.CollectError: ImportError while importing test module '/home/heim/Repos/QGym/gym/utils/__init__.py'. ERROR gym/utils/math/test_math.py::test_exp_avg_filter - _pytest.nodes.Collector.CollectError: ImportError while importing test module '/home/heim/Repos/QGym/gym/utils/__init__.py'. 
``` --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d08d918a..e2a6fe2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ torch>=2.0 torchvision torchaudio pygame -pytest +pytest<8.0 mss ruff pre-commit From af0de68741bcdd6baf2a35057d20d2faacca674d Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:21:17 -0500 Subject: [PATCH 14/69] training critic off of frozen policy finished + tweaks --- gym/envs/pendulum/pendulum_config.py | 1 - gym/utils/helpers.py | 6 ++++-- learning/runners/__init__.py | 1 + learning/runners/data_logging_runner.py | 1 + scripts/lqrc/brute_force_value_function.py | 13 +++++++------ 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 0fc99016..46e52923 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -117,4 +117,3 @@ class runner(FixedRobotCfgPPO.runner): max_iterations = 500 # number of policy updates algorithm_class_name = "PPO" num_steps_per_env = 32 - diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index a7363148..4cba1a02 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -183,15 +183,17 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner.device = args.rl_device if args.train_critic_only: train_cfg.runner.algorithm_class_name = "PPOCriticOnly" + train_cfg.runner_class_name = "CriticOnlyRunner" + train_cfg.runner.resume = True train_cfg.runner.experiment_name += "_critic_only" if args.task == "pendulum" and args.custom_critic: train_cfg.policy.standard_critic_nn = False train_cfg.algorithm.standard_loss = False - train_cfg.runner.run_name += "custom_critic" + train_cfg.runner.run_name += "custom_critic_only" elif args.task == "pendulum" and not args.custom_critic: train_cfg.policy.standard_critic_nn = True train_cfg.algorithm.standard_loss = True - train_cfg.runner.run_name += "standard_critic" + train_cfg.runner.run_name += "standard_critic_only" else: if args.task == "pendulum" and args.custom_critic: train_cfg.policy.standard_critic_nn = False diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index 7f474ff2..bd8ddf5a 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -33,3 +33,4 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner from .data_logging_runner import DataLoggingRunner +from .critic_only_runner import CriticOnlyRunner diff --git a/learning/runners/data_logging_runner.py b/learning/runners/data_logging_runner.py index f374d84c..4bca88bc 100644 --- a/learning/runners/data_logging_runner.py +++ b/learning/runners/data_logging_runner.py @@ -100,6 +100,7 @@ def learn(self): if not os.path.exists(dir_path): os.makedirs(dir_path) np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") self.save() diff --git a/scripts/lqrc/brute_force_value_function.py b/scripts/lqrc/brute_force_value_function.py index fd3baeb8..a76c988b 100644 --- a/scripts/lqrc/brute_force_value_function.py +++ b/scripts/lqrc/brute_force_value_function.py @@ -10,6 +10,7 @@ from learning.modules.lqrc.plotting import ( plot_value_func, plot_value_func_error, + plot_training_data_dist, ) from isaacgym import gymtorch @@ -204,15 +205,15 @@ def query_value_function(vf_args, grid): # get NN value functions custom_vf_args = { - 
"experiment_name": "pendulum_custom_critic", - "load_run": "Feb22_14-11-04_custom_critic", + "experiment_name": "pendulum_critic_only", + "load_run": "Mar01_11-03-27_custom_critic_only", "checkpoint": -1, "model_type": "CholeskyPlusConst", } custom_critic_returns = query_value_function(custom_vf_args, grid) standard_vf_args = { - "experiment_name": "pendulum_standard_critic", - "load_run": "Feb22_14-07-29_standard_critic", + "experiment_name": "pendulum_critic_only", + "load_run": "Mar01_11-06-37_standard_critic_only", "checkpoint": -1, "model_type": "StandardMLP", } @@ -237,5 +238,5 @@ def query_value_function(vf_args, grid): ) # ! store data dist with model logs to ensure they're paired properly - # plot_training_data_dist(npy_fn, - # save_path + "/data_distribution.png") + plot_training_data_dist(npy_fn, + save_path + "/data_distribution.png") From 0a2d28bcae6148f4c032ed1d3c8369e9389c38be Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:06:27 -0500 Subject: [PATCH 15/69] new NNs and graphing tweaks --- gym/envs/pendulum/pendulum.py | 4 ++ learning/modules/lqrc/custom_nn.py | 80 +++++++++++++++++++++ learning/modules/lqrc/evaluate_critic.py | 4 +- learning/modules/lqrc/plotting.py | 34 ++++++--- learning/modules/lqrc/train_benchmark.py | 81 +++++++++++++++------- scripts/lqrc/brute_force_value_function.py | 57 +++++++-------- 6 files changed, 196 insertions(+), 64 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 7e81a51f..5d3c07f6 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -25,6 +25,10 @@ def _reward_equilibrium(self): # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma # ) + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) + def _reward_energy(self): m_pendulum = 1.0 l_pendulum = 1.0 diff --git a/learning/modules/lqrc/custom_nn.py b/learning/modules/lqrc/custom_nn.py index b47357e0..7cdcac2e 100644 --- a/learning/modules/lqrc/custom_nn.py +++ b/learning/modules/lqrc/custom_nn.py @@ -151,6 +151,86 @@ def forward(self, y_pred, y, intermediates=None): return loss + const_loss +class CholeskyOffset1(QuadraticNetCholesky): + def __init__(self, input_size, hidden_dims=128, device="cuda"): + # + input_size for offset + super(QuadraticNetCholesky, self).__init__( + input_size, + sum(range(input_size + 1)) + input_size, + hidden_dims=hidden_dims, + device=device, + ) + self.L_indices = sum(range(input_size + 1)) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[:, :self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices:] + x_bar = x - offset + y_pred = (x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2))).squeeze(2) + return y_pred + + def save_intermediate(self): + """ + Forward hook to save A and offset + """ + def hook(module, input, output): + C = self.create_cholesky(output[:, :self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices:] + self.intermediates["A"] = A + self.intermediates["offset"] = offset + + return hook + + +class CholeskyOffset2(QuadraticNetCholesky): + def 
__init__(self, input_size, hidden_dims=128, device="cuda"): + super().__init__(input_size) + self.xhat_layer = nn.Linear(input_size, input_size) + # self.xhat_layer.register_forward_hook(self.save_xhat()) + + def forward(self, x): + xhat = self.xhat_layer(x) + output = self.connection_1(xhat) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + y_pred = (xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2))).squeeze(2) + return y_pred + + def save_intermediate(self): + """ + Forward hook to save A + """ + def hook(module, input, output): + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + self.intermediates["A"] = A + + return hook + + def save_xhat(self): + """ + Forward hook to save xhat for debugging + """ + def hook(module, input, output): + self.intermediates["xhat"] = output + + return hook + + class LQRCDataset(Dataset): def __init__(self, X, y): """ diff --git a/learning/modules/lqrc/evaluate_critic.py b/learning/modules/lqrc/evaluate_critic.py index 9b61d963..f4401900 100644 --- a/learning/modules/lqrc/evaluate_critic.py +++ b/learning/modules/lqrc/evaluate_critic.py @@ -59,8 +59,8 @@ def model_switch(args): c_pred = [] model.eval() - t_c = 0 - p_c = 0 + # t_c = 0 + # p_c = 0 for X_batch in x: y_hat = model.evaluate(X_batch.unsqueeze(0)) y_pred.append(y_hat) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index aa862b3c..840f1517 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -4,7 +4,7 @@ from matplotlib.colors import CenteredNorm, TwoSlopeNorm -def graph_3D_helper(ax, contour): +def graph_3D_helper(ax, contour=False): if contour: return ax.contourf return ax.pcolormesh @@ -198,13 +198,13 @@ def plot_predictions_and_gradients( y_pred = y_pred.detach().cpu().numpy() y_actual = y_actual.detach().cpu().numpy() if actual_grad is not None: - actual_grad = actual_grad.detach().cpu().numpy() + actual_grad = actual_grad.detach().cpu().numpy() if not isinstance(actual_grad, np.ndarray) else actual_grad if colormap_diff: fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, figsize=(16, 8), layout="tight" ) sq_len = int(sqrt(x_actual.shape[0])) - img1 = ax1.contourf( + img1 = graph_3D_helper(ax1)( x_actual[:, 0].reshape(sq_len, sq_len), x_actual[:, 1].reshape(sq_len, sq_len), np.linalg.norm( @@ -214,16 +214,16 @@ def plot_predictions_and_gradients( - actual_grad, axis=1, ).reshape(sq_len, sq_len), - cmap="RdBu_r", - norm=TwoSlopeNorm(0), + cmap="OrRd", + vmin=0.0, vmax=30.0, ) plt.colorbar(img1, ax=ax1) - img2 = ax2.contourf( + img2 = graph_3D_helper(ax2)( x_actual[:, 0].reshape(sq_len, sq_len), x_actual[:, 1].reshape(sq_len, sq_len), (y_pred - y_actual).reshape(sq_len, sq_len), - cmap="PuOr", - norm=TwoSlopeNorm(0), + cmap="bwr", + norm=CenteredNorm() ) plt.colorbar(img2, ax=ax2) set_titles_labels( @@ -243,7 +243,7 @@ def plot_predictions_and_gradients( c=y_pred, cmap="viridis", marker="o", - edgecolors="k", + alpha=0.7 ) scatter = axes[1].scatter( x_actual[:, 0], @@ -251,7 +251,7 @@ def plot_predictions_and_gradients( c=y_actual, cmap="viridis", marker="^", - edgecolors="k", + alpha=0.7 ) fig.colorbar(scatter, ax=axes.ravel().tolist(), shrink=0.95, label="f(x)") axes[0].set_title("Pointwise Predictions") @@ -278,6 +278,20 @@ def plot_predictions_and_gradients( for a in [45, 135, 180, 225]: 
ax.view_init(elev=10.0, azim=a) plt.savefig(f"{fn}_angle_{a}.png") + if colormap_diff: + fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) + scatter = axis.scatter( + x_actual[:, 0], + x_actual[:, 1], + c=y_pred - y_actual, + cmap="bwr", + marker="o", + alpha=0.5, + norm=CenteredNorm(), + ) + fig.colorbar(scatter, ax=axis, shrink=0.95, label="f(x)") + axis.set_title("Error Between Pointwise Predictions and Targets") + plt.savefig(f"{fn}_value_error_colormap.png") elif dim == 2: x_actual = x_actual.detach().cpu().numpy() y_pred = y_pred.detach().cpu().numpy() diff --git a/learning/modules/lqrc/train_benchmark.py b/learning/modules/lqrc/train_benchmark.py index 69058087..a321c208 100644 --- a/learning/modules/lqrc/train_benchmark.py +++ b/learning/modules/lqrc/train_benchmark.py @@ -1,6 +1,8 @@ import os import time +import numpy as np import torch +from learning.modules.utils.neural_net import create_MLP from torchviz import make_dot from torch.utils.data import DataLoader from torch.utils.data import random_split @@ -11,6 +13,8 @@ CustomCholeskyLoss, CholeskyPlusConst, CustomCholeskyPlusConstLoss, + CholeskyOffset1, + CholeskyOffset2, ) from learning import LEGGED_GYM_LQRC_DIR from utils import benchmark_args @@ -100,6 +104,23 @@ def bound_function_smoothly(y): return X, y +def load_ground_truth(*args, **kwargs): + fn = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + data = torch.from_numpy(np.load(fn)).to(DEVICE).type(torch.float32) + return data[:, :-1], data[:, -1].unsqueeze(1) + + +def ground_truth_numerial_gradients(): + fn = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + data = np.load(fn) + sq_size = int(np.sqrt(data.shape[0]).item()) + gridded_data = data[:, -1].reshape((sq_size, sq_size)) + h_theta = abs(data[0, 0] - data[sq_size, 0]).item() + h_omega = abs(data[0, 1] - data[1, 1]).item() + theta_grad, theta_omega = np.gradient(gridded_data, h_theta, h_omega) + return np.concatenate((theta_grad.flatten().reshape(-1, 1), theta_omega.flatten().reshape(-1, 1)), axis=1) + + def model_switch(input_dim, model_name=None): if model_name == "QuadraticNetCholesky": return QuadraticNetCholesky(input_dim, device=DEVICE).to( @@ -109,10 +130,17 @@ def model_switch(input_dim, model_name=None): return CholeskyPlusConst(input_dim, device=DEVICE).to( DEVICE ), CustomCholeskyPlusConstLoss(const_penalty=0.1) - else: - return BaselineMLP(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( - reduction="mean" - ) + elif model_name == "VanillaCritic": + num_obs = input_dim + hidden_dims = [128, 64, 32] + activation = "tanh" + return create_MLP(num_obs, 1, hidden_dims, activation).to(DEVICE), torch.nn.MSELoss(reduction="mean") + elif model_name == "CholeskyOffset1": + return CholeskyOffset1(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss(reduction="mean") + elif model_name == "CholeskyOffset2": + return CholeskyOffset2(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss(reduction="mean") + return BaselineMLP(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + reduction="mean") def test_case_switch(case_name=None): @@ -122,8 +150,17 @@ def test_case_switch(case_name=None): return generate_cos elif case_name == "quadratic": return generate_nD_quadratic - else: - assert case_name is not None, "Please specify a valid test case when running." + elif case_name == "ground_truth": + return load_ground_truth + assert case_name is not None, "Please specify a valid test case when running." 
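# A condensed, standalone sketch of the Cholesky-parameterized quadratic
# critics defined above (CholeskyOffset-style): an MLP predicts the
# n*(n+1)/2 entries of a lower-triangular L plus an offset c, and the value
# is the positive semi-definite quadratic form (x - c)^T L L^T (x - c).
# Layer sizes and names are illustrative, not the classes above.
import torch
import torch.nn as nn


class QuadraticValueSketch(nn.Module):
    def __init__(self, n, hidden=128):
        super().__init__()
        self.n = n
        self.n_tril = n * (n + 1) // 2                      # == sum(range(n + 1))
        self.net = nn.Sequential(
            nn.Linear(n, hidden), nn.Tanh(), nn.Linear(hidden, self.n_tril + n)
        )
        self.register_buffer("tril_idx", torch.tril_indices(n, n))

    def forward(self, x):
        out = self.net(x)
        L = x.new_zeros(x.shape[0], self.n, self.n)          # batched lower-triangular
        L[:, self.tril_idx[0], self.tril_idx[1]] = out[:, : self.n_tril]
        offset = out[:, self.n_tril:]
        x_bar = (x - offset).unsqueeze(-1)                   # [B, n, 1]
        A = L @ L.transpose(1, 2)                            # PSD by construction
        return (x_bar.transpose(1, 2) @ A @ x_bar).squeeze(-1)  # [B, 1]


print(QuadraticValueSketch(n=2)(torch.randn(8, 2)).shape)    # torch.Size([8, 1])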
+ + +def gradient_switch(test_case, **kwargs): + if test_case == "rosenbrock": + return generate_rosenbrock_grad(torch.vstack(kwargs["all_inputs"])) + elif test_case == "ground_truth": + return ground_truth_numerial_gradients() + return None if __name__ == "__main__": @@ -163,11 +200,7 @@ def test_case_switch(case_name=None): model, loss_fn = model_switch(input_dim, model_type) print(model) - optimizer = torch.optim.Adam( - model.parameters(), - betas=(0.75, 0.9), - weight_decay=2.074821225483474e-07, - ) + optimizer = torch.optim.Adam(model.parameters()) # train model.train() @@ -179,11 +212,7 @@ def test_case_switch(case_name=None): for X_batch, y_batch in train_dataloader: X_batch.requires_grad_() y_pred = model(X_batch) - loss = ( - loss_fn(y_pred, y_batch) - if model_type == "BaselineMLP" - else loss_fn(y_pred, y_batch, intermediates=model.intermediates) - ) + loss = loss_fn(y_pred, y_batch, intermediates=model.intermediates) if model_type == "CholeskyPlusConst" else loss_fn(y_pred, y_batch) loss_per_batch.append(loss.item()) optimizer.zero_grad() loss.backward() @@ -213,18 +242,20 @@ def test_case_switch(case_name=None): batch_size=1, shuffle=False, ) + if args["test_case"] == "ground_truth": + graphing_data = LQRCDataset(*load_ground_truth()) + test_dataloader = DataLoader( + torch.utils.data.Subset(graphing_data, range(len(graphing_data))), + batch_size=1, + shuffle=False, + ) for X_batch, y_batch in test_dataloader: # give X_batch a grad_fn X_batch.requires_grad_() X_batch = X_batch + 0 y_pred = model(X_batch) - loss = ( - loss_fn(y_pred, y_batch) - if model_type == "BaselineMLP" - else loss_fn(y_pred, y_batch, intermediates=model.intermediates) - ) - # could wrap BaselineMLP in a loss that ignores intermediates to eliminate if/else + loss = loss_fn(y_pred, y_batch, intermediates=model.intermediates) if model_type == "CholeskyPlusConst" else loss_fn(y_pred, y_batch) if model_type == "QuadraticNetCholesky": # ! do we want this anymore? 
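# The evaluation loop in this script re-attaches each input to the autograd
# graph so the model's gradient with respect to its input can be compared
# against analytic or numerical gradients. A minimal sketch of that pattern;
# the model and shapes are placeholders.
import torch
import torch.nn as nn

toy_model = nn.Sequential(nn.Linear(2, 32), nn.Tanh(), nn.Linear(32, 1))
x = torch.randn(8, 2, requires_grad=True)
y = toy_model(x)
# dV/dx for every sample; summing is fine because samples do not interact,
# so each row of grad_x is the gradient of its own output
(grad_x,) = torch.autograd.grad(y.sum(), x)
print(grad_x.shape)  # torch.Size([8, 2])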
A_pred = model.intermediates["A"] @@ -268,6 +299,8 @@ def test_case_switch(case_name=None): ).render(f"{LEGGED_GYM_LQRC_DIR}/logs/model_viz", format="png") print("Saving to", save_path) + grad_kwargs = {"all_inputs": all_inputs} if test_case == "rosenbrock" else {} + plot_predictions_and_gradients( input_dim + 1, torch.vstack(all_inputs), @@ -277,9 +310,7 @@ def test_case_switch(case_name=None): f"{save_path}/{time_str}_grad_graph", colormap_diff=args["colormap_diff"], colormap_values=args["colormap_values"], - actual_grad=generate_rosenbrock_grad(torch.vstack(all_inputs)) - if test_case == "rosenbrock" - else None, + actual_grad=gradient_switch(test_case, **grad_kwargs), ) plot_loss(training_losses, f"{save_path}/{time_str}_loss") diff --git a/scripts/lqrc/brute_force_value_function.py b/scripts/lqrc/brute_force_value_function.py index a76c988b..3463e2d6 100644 --- a/scripts/lqrc/brute_force_value_function.py +++ b/scripts/lqrc/brute_force_value_function.py @@ -175,33 +175,36 @@ def query_value_function(vf_args, grid): env, runner, train_cfg = setup(args) ground_truth_returns, logs = get_ground_truth(env, runner, train_cfg, grid) - logs = {k: v.detach().cpu().numpy() for k, v in logs.items()} - np.savez_compressed(os.path.join(save_path, "data.npz"), **logs) - print("saved logs to", os.path.join(save_path, "data.npz")) - - high_gt_returns = [] - for i in range(ground_truth_returns.shape[1] - 1): - if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0: - high_gt_returns.append( - torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy() - ) - if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0: - high_gt_returns.append( - torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy() - ) - high_gt_returns = np.array(high_gt_returns) - returns_save_path = ( - f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy" - if args.custom_critic - else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy" - ) - returns_save_path = ( - f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy" - if args.custom_critic - else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy" - ) - np.save(returns_save_path, high_gt_returns) - print("Saved high returns to", returns_save_path) + grid_gt_combined = np.hstack((grid.detach().cpu().numpy(), ground_truth_returns.T)) + gt_save_path = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + np.save(gt_save_path, grid_gt_combined) + print("Saved high returns to", gt_save_path) + + exit() + + + # logs = {k: v.detach().cpu().numpy() for k, v in logs.items()} + # np.savez_compressed(os.path.join(save_path, "data.npz"), **logs) + # print("saved logs to", os.path.join(save_path, "data.npz")) + + # high_gt_returns = [] + # for i in range(ground_truth_returns.shape[1] - 1): + # if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0: + # high_gt_returns.append( + # torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy() + # ) + # if ground_truth_returns[0, i] > 3.5 and ground_truth_returns[0, i + 1] < 2.0: + # high_gt_returns.append( + # torch.hstack((grid[i, :], grid[i + 1, :])).detach().cpu().numpy() + # ) + # high_gt_returns = np.array(high_gt_returns) + # returns_save_path = ( + # f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy" + # if args.custom_critic + # else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy" + # ) + # np.save(returns_save_path, high_gt_returns) + # print("Saved high returns to", returns_save_path) # get NN value functions custom_vf_args = { From 
29d3de24a2bcff525530ab2f754b6b4a55466d72 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 4 Mar 2024 15:16:07 -0500 Subject: [PATCH 16/69] update logger to allow flexibly adding arbitrary categories for per-iteration logging. --- learning/runners/my_runner.py | 11 ++++++++++- learning/runners/on_policy_runner.py | 2 +- learning/utils/logger/Logger.py | 20 ++++++++++++-------- learning/utils/logger/test_logger.py | 3 +-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index e555b525..0cc5cfdf 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -77,7 +77,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") @@ -112,6 +112,15 @@ def set_up_logger(self): "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) + logger.register_category( + "actor", + self.alg.actor_critic, + [ + "action_std", + "entropy", + ], + ) + logger.attach_torch_obj_to_wandb( (self.alg.actor_critic.actor, self.alg.actor_critic.critic) ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e2fb524e..1c0fdfe8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -69,7 +69,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") diff --git a/learning/utils/logger/Logger.py b/learning/utils/logger/Logger.py index f84626df..bcaa56a2 100644 --- a/learning/utils/logger/Logger.py +++ b/learning/utils/logger/Logger.py @@ -122,19 +122,23 @@ def separator(subtitle="", marker="-"): ) print(log_string) - def log_category(self, category="algorithm"): - self.iteration_logs.log(category) + def log_all_categories(self): + for category in self.iteration_logs.logs.keys(): + self.iteration_logs.log(category) def log_to_wandb(self): - def prepend_to_keys(section, dictionary): - return {section + "/" + key: val for key, val in dictionary.items()} + def prepend_to_keys(prefix, dictionary): + return {prefix + "/" + key: val for key, val in dictionary.items()} averages = prepend_to_keys("rewards", self.reward_logs.get_average_rewards()) - algorithm_logs = prepend_to_keys( - "algorithm", self.iteration_logs.get_all_logs("algorithm") - ) - wandb.log({**averages, **algorithm_logs}) + category_logs = { + f"{category}/{key}": val + for category in self.iteration_logs.logs.keys() + for key, val in self.iteration_logs.get_all_logs(category).items() + } + + wandb.log({**averages, **category_logs}) def tic(self, category="default"): self.timer.tic(category) diff --git a/learning/utils/logger/test_logger.py b/learning/utils/logger/test_logger.py index ebd0b859..eaef76f6 100644 --- a/learning/utils/logger/test_logger.py +++ b/learning/utils/logger/test_logger.py @@ -126,8 +126,7 @@ def test_logging_iteration(): logger.register_category("performanceAB", mocky, ["a", "b"]) for _ in range(10): - logger.log_category("performanceA") - logger.log_category("performanceAB") + logger.log_all_categories() logger.finish_iteration() for key, val in logger.iteration_logs.logs["performanceA"].items(): From 0bd395a21b3e547e1a63c62605c57e7afd69d029 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 6 Mar 2024 11:01:07 -0500 Subject: [PATCH 17/69] log actor things in on_policy_runner as well. 
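The pattern both of these logger commits rely on, reduced to a minimal sketch (names here are illustrative, not the repo's Logger class): each registered category remembers an object plus the attribute names to snapshot, and every iteration the current values are flattened into `category/attribute` keys, which is also the prefixing scheme used for wandb.

```python
class CategoryLoggerSketch:
    def __init__(self):
        self.categories = {}

    def register_category(self, name, obj, attributes):
        # remember where to read the values from, not the values themselves
        self.categories[name] = (obj, attributes)

    def snapshot(self):
        # pull current values and flatten them into "category/attribute" keys
        return {
            f"{name}/{attr}": getattr(obj, attr)
            for name, (obj, attributes) in self.categories.items()
            for attr in attributes
        }


# usage mirrors the runners above, e.g.:
#   logger.register_category("actor", alg.actor_critic, ["action_std", "entropy"])
```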
--- learning/runners/my_runner.py | 8 +------- learning/runners/on_policy_runner.py | 3 +++ learning/storage/base_storage.py | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 0cc5cfdf..c3b0b477 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -111,14 +111,8 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", - self.alg.actor_critic, - [ - "action_std", - "entropy", - ], + "actor", self.alg.actor_critic, ["action_std", "entropy"] ) logger.attach_torch_obj_to_wandb( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1c0fdfe8..3a40974a 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -103,6 +103,9 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) + logger.register_category( + "actor", self.alg.actor_critic, ["action_std", "entropy"] + ) logger.attach_torch_obj_to_wandb( (self.alg.actor_critic.actor, self.alg.actor_critic.critic) diff --git a/learning/storage/base_storage.py b/learning/storage/base_storage.py index 66a78328..835d3b07 100644 --- a/learning/storage/base_storage.py +++ b/learning/storage/base_storage.py @@ -1,4 +1,4 @@ -""" Store data for different loop +"""Store data for different loop Design consideration: change the original "has-a" relationship of LT->rollout->transition to parallel. Transition stores all the needed data for every step and will be cleared/updated every step and passed to rollout and From 9a043755e817760b64c8ca872afbbb11e3d7bab9 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:34:36 -0500 Subject: [PATCH 18/69] small fixes --- learning/modules/lqrc/custom_nn.py | 2 +- learning/modules/lqrc/plotting.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/learning/modules/lqrc/custom_nn.py b/learning/modules/lqrc/custom_nn.py index 7cdcac2e..ee9c9ff0 100644 --- a/learning/modules/lqrc/custom_nn.py +++ b/learning/modules/lqrc/custom_nn.py @@ -195,7 +195,7 @@ class CholeskyOffset2(QuadraticNetCholesky): def __init__(self, input_size, hidden_dims=128, device="cuda"): super().__init__(input_size) self.xhat_layer = nn.Linear(input_size, input_size) - # self.xhat_layer.register_forward_hook(self.save_xhat()) + self.xhat_layer.register_forward_hook(self.save_xhat()) def forward(self, x): xhat = self.xhat_layer(x) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 840f1517..aa8bf02a 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -1,8 +1,17 @@ from math import sqrt +import matplotlib import matplotlib.pyplot as plt import numpy as np from matplotlib.colors import CenteredNorm, TwoSlopeNorm +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["ps.fonttype"] = 42 +matplotlib.rcParams["mathtext.fontset"] = "stix" +matplotlib.rcParams["font.family"] = "STIXGeneral" + +font = {"size": 12} +matplotlib.rc("font", **font) + def graph_3D_helper(ax, contour=False): if contour: @@ -223,7 +232,7 @@ def plot_predictions_and_gradients( x_actual[:, 1].reshape(sq_len, sq_len), (y_pred - y_actual).reshape(sq_len, sq_len), cmap="bwr", - norm=CenteredNorm() + vmin=-2.0, vmax=2.0 ) plt.colorbar(img2, ax=ax2) 
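# A minimal standalone example of the centered diverging error maps used in
# this plotting module, assuming matplotlib >= 3.4 for CenteredNorm; the data
# and output filename are placeholders.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import CenteredNorm

theta, omega = np.meshgrid(np.linspace(-np.pi, np.pi, 60), np.linspace(-5.0, 5.0, 60))
error = np.sin(theta) * omega                      # stand-in for y_pred - y_actual
fig, ax = plt.subplots(figsize=(6, 6))
# CenteredNorm pins the neutral color of the diverging map at zero error;
# an explicit vmin/vmax (as in the code above) instead keeps the scale fixed
# across runs so different models are directly comparable
img = ax.pcolormesh(theta, omega, error, cmap="bwr", shading="auto", norm=CenteredNorm())
fig.colorbar(img, ax=ax, label="prediction error")
fig.savefig("error_colormap.png")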
set_titles_labels( @@ -287,7 +296,7 @@ def plot_predictions_and_gradients( cmap="bwr", marker="o", alpha=0.5, - norm=CenteredNorm(), + norm=CenteredNorm() ) fig.colorbar(scatter, ax=axis, shrink=0.95, label="f(x)") axis.set_title("Error Between Pointwise Predictions and Targets") From b5d18232817d6ab7e3bb2ff376108a5f4b7c4f7e Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 13 Mar 2024 13:12:01 -0400 Subject: [PATCH 19/69] save logs under `logs/lqrc`, as the project folder some refactoring for appeasing ruff --- gym/envs/__init__.py | 7 - gym/envs/mit_humanoid/humanoid_bouncing.py | 443 ------------------ .../mit_humanoid/humanoid_bouncing_config.py | 296 ------------ learning/modules/lqrc/custom_nn.py | 19 +- learning/modules/lqrc/evaluate_critic.py | 4 +- learning/modules/lqrc/plotting.py | 20 +- learning/modules/lqrc/run_plot.py | 6 +- learning/modules/lqrc/train_benchmark.py | 42 +- learning/modules/utils/normalize.py | 2 - learning/runners/critic_only_runner.py | 3 +- learning/runners/data_logging_runner.py | 7 +- scripts/lqrc/brute_force_value_function.py | 17 +- scripts/lqrc/rollout_init_conditions.py | 17 +- scripts/lqrc/simple_plots.py | 6 +- 14 files changed, 86 insertions(+), 803 deletions(-) delete mode 100644 gym/envs/mit_humanoid/humanoid_bouncing.py delete mode 100644 gym/envs/mit_humanoid/humanoid_bouncing_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 763007cc..57cbd8c0 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,6 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "HumanoidBouncing": ".mit_humanoid.humanoid_bouncing", "Pendulum": ".pendulum.pendulum", } @@ -32,7 +31,6 @@ "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", - "HumanoidBouncingCfg": ".mit_humanoid.humanoid_bouncing_config", "PendulumCfg": ".pendulum.pendulum_config", } @@ -45,7 +43,6 @@ "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", - "HumanoidBouncingRunnerCfg": ".mit_humanoid.humanoid_bouncing_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", } @@ -68,10 +65,6 @@ "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "humanoid_bouncing": ["HumanoidBouncing", - "HumanoidBouncingCfg", - "HumanoidBouncingRunnerCfg"], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] } diff --git a/gym/envs/mit_humanoid/humanoid_bouncing.py b/gym/envs/mit_humanoid/humanoid_bouncing.py deleted file mode 100644 index 6dedf8dd..00000000 --- a/gym/envs/mit_humanoid/humanoid_bouncing.py +++ /dev/null @@ -1,443 +0,0 @@ -import torch - -from isaacgym.torch_utils import quat_rotate_inverse -from gym.envs import LeggedRobot -from isaacgym import gymutil -from isaacgym import gymapi - - -class HumanoidBouncing(LeggedRobot): - def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): - super().__init__(gym, sim, cfg, sim_params, sim_device, headless) - - def _init_buffers(self): - super()._init_buffers() - - # * get the body_name to body_index dict - body_dict = self.gym.get_actor_rigid_body_dict( - self.envs[0], self.actor_handles[0] - ) - # * extract a list of body_names where the index is the id number - body_names = [ - body_tuple[0] - for body_tuple in sorted( - 
body_dict.items(), key=lambda body_tuple: body_tuple[1] - ) - ] - # * construct a list of id numbers corresponding to end_effectors - self.end_effector_ids = [] - for end_effector_name in self.cfg.asset.end_effector_names: - self.end_effector_ids.extend( - [ - body_names.index(body_name) - for body_name in body_names - if end_effector_name in body_name - ] - ) - - # * end_effector_pos is world-frame and converted to env_origin - self.end_effector_pos = self._rigid_body_pos[ - :, self.end_effector_ids - ] - self.env_origins.unsqueeze(dim=1).expand( - self.num_envs, len(self.end_effector_ids), 3 - ) - self.end_effector_quat = self._rigid_body_quat[:, self.end_effector_ids] - - self.end_effector_lin_vel = torch.zeros( - self.num_envs, - len(self.end_effector_ids), - 3, - dtype=torch.float, - device=self.device, - ) - self.end_effector_ang_vel = torch.zeros( - self.num_envs, - len(self.end_effector_ids), - 3, - dtype=torch.float, - device=self.device, - ) - - # * end_effector vels are body-relative like body vels above - for index in range(len(self.end_effector_ids)): - self.end_effector_lin_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_lin_vel[:, self.end_effector_ids][:, index, :], - ) - self.end_effector_ang_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_ang_vel[:, self.end_effector_ids][:, index, :], - ) - - # * separate legs and arms - self.dof_pos_target_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_pos_target_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - self.dof_pos_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_pos_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - self.dof_vel_legs = torch.zeros( - self.num_envs, 10, dtype=torch.float, device=self.device - ) - self.dof_vel_arms = torch.zeros( - self.num_envs, 8, dtype=torch.float, device=self.device - ) - - # * other - self.base_pos = self.root_states[:, 0:3] - self.phase = torch.zeros( - self.num_envs, 1, dtype=torch.float, device=self.device - ) - self.phase_freq = 1.0 - - # * high level - self.hl_impulses = torch.zeros( - self.num_envs, 4, 5, dtype=torch.float, device=self.device - ) - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - self.hl_ix = torch.zeros( - self.num_envs, 1, dtype=torch.float, device=self.device - ) - self.hl_commands = torch.zeros( - self.num_envs, 6, dtype=torch.float, device=self.device - ) - self.time_since_hl_query = torch.zeros( - self.num_envs, 1, device=self.device, dtype=torch.float - ) - - def _pre_physics_step(self): - super()._pre_physics_step() - self.dof_pos_target[:, :10] = self.dof_pos_target_legs - self.dof_pos_target[:, 10:] = self.dof_pos_target_arms - - envs_to_resample = torch.where( - self.time_since_hl_query == self.cfg.high_level.interval, - True, - False, - ) - if envs_to_resample.any().item(): - self._resample_high_level( - envs_to_resample.nonzero(as_tuple=False).flatten() - ) - - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - - def _reset_system(self, env_ids): - super()._reset_system(env_ids) - # if self.cfg.commands.resampling_time == -1: - # self.commands[env_ids, :] = 0. 
- self.phase[env_ids, 0] = torch.rand( - (torch.numel(env_ids),), requires_grad=False, device=self.device - ) - self._resample_high_level(env_ids) - self.hl_impulses_flat = self.hl_impulses.flatten(start_dim=1) - - def _post_physics_step(self): - super()._post_physics_step() - self.time_since_hl_query += 1 # self.dt - self.phase_sin = torch.sin(2 * torch.pi * self.phase) - self.phase_cos = torch.cos(2 * torch.pi * self.phase) - - self.in_contact = self.contact_forces[:, self.end_effector_ids, 2].gt(0.0) - self.phase = torch.fmod(self.phase + self.dt, 1.0) - - self.dof_pos_legs = self.dof_pos[:, :10] - self.dof_pos_arms = self.dof_pos[:, 10:] - self.dof_vel_legs = self.dof_vel[:, :10] - self.dof_vel_arms = self.dof_vel[:, 10:] - - # * end_effector_pos is world-frame and converted to env_origin - self.end_effector_pos = self._rigid_body_pos[ - :, self.end_effector_ids - ] - self.env_origins.unsqueeze(dim=1).expand( - self.num_envs, len(self.end_effector_ids), 3 - ) - self.end_effector_quat = self._rigid_body_quat[:, self.end_effector_ids] - - # * end_effector vels are body-relative like body vels above - for index in range(len(self.end_effector_ids)): - self.end_effector_lin_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_lin_vel[:, self.end_effector_ids][:, index, :], - ) - self.end_effector_ang_vel[:, index, :] = quat_rotate_inverse( - self.base_quat, - self._rigid_body_ang_vel[:, self.end_effector_ids][:, index, :], - ) - - # * update HL indices - delta_hl_now = self.hl_impulses[:, 0, :] - self.time_since_hl_query.view( - self.num_envs, 1 - ).expand(self.num_envs, 5) - self.hl_ix = torch.argmin( - torch.where( - delta_hl_now >= 0, - delta_hl_now, - float("inf") * torch.ones_like(delta_hl_now), - ), - dim=1, - ).unsqueeze(1) - self._update_hl_commands() - # roll out next time step HL command - self.hl_commands[:, :3] += self.hl_commands[:, 3:] * self.dt - self.hl_commands[:, 5] -= 9.81 * self.dt - - def _resample_high_level(self, env_ids): - """ - Updates impulse sequences for envs its passed s.t. 
the first impulse compensates for - tracking error and the rest enforce the same bouncing ball trajectory on the COM - """ - self.time_since_hl_query[env_ids] = 0.0 - delta_touchdown = self._time_to_touchdown( - ( - self.root_states[env_ids, 2:3] - - self.cfg.reward_settings.base_height_target - ), - self.base_lin_vel[env_ids, 2].unsqueeze(1), - -9.81, - ) - - step_time = self.cfg.high_level.sec_per_gait - - first_xy = torch.tensor( - [[[1], [0]]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 1) - ( - self.base_lin_vel[env_ids, :2] * delta_touchdown.repeat(1, 2) - ).unsqueeze(2) - first_z = torch.tensor( - [[[step_time * 9.81 / 2]]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 1) - ( - self.base_lin_vel[env_ids, 2] - 9.81 * delta_touchdown.squeeze(1) - ).view(env_ids.shape[0], 1, 1) - first_impulse = torch.cat((first_xy, first_z), dim=1) - remaining_impulses = torch.tensor( - [[0], [0], [step_time * 9.81]], dtype=torch.float, device=self.device - ).repeat(env_ids.shape[0], 1, 4) - impulse_mag_buf = torch.cat((first_impulse, remaining_impulses), dim=2) - - time_rollout = torch.cat( - ( - delta_touchdown, - delta_touchdown + (1.0 * step_time), - delta_touchdown + (2.0 * step_time), - delta_touchdown + (3.0 * step_time), - delta_touchdown + (4.0 * step_time), - ), - dim=1, - ).unsqueeze(1) - - time_rollout = time_rollout / self.dt - time_rollout = time_rollout.round().int() - - self.hl_impulses[env_ids] = torch.cat((time_rollout, impulse_mag_buf), dim=1) - self.hl_ix[env_ids] = 0 - - # seed the root states for roll out via post physics - self.hl_commands[env_ids, :3] = self.base_pos[env_ids, :] - self.hl_commands[env_ids, 3:] = self.base_lin_vel[env_ids, :] - - if self.cfg.viewer.record: - # draw new high level target trajectories - self.gym.clear_lines(self.viewer) - sphere_geom = gymutil.WireframeSphereGeometry( - 0.1, 64, 64, None, color=(1, 1, 0) - ) - - for i in range(self.num_envs): - base_pos = (self.root_states[i, :3]).cpu().numpy() - sphere_pose = gymapi.Transform( - gymapi.Vec3(base_pos[0], base_pos[1], base_pos[2]), r=None - ) - gymutil.draw_lines( - sphere_geom, self.gym, self.viewer, self.envs[i], sphere_pose - ) - line = [ - base_pos[0], - base_pos[1], - self.cfg.reward_settings.base_height_target, - base_pos[0] + 100, - base_pos[1], - self.cfg.reward_settings.base_height_target, - ] - self.gym.add_lines(self.viewer, self.envs[i], 1, line, [0.85, 0.1, 0.1]) - - def _time_to_touchdown(self, pos, vel, acc): - """ - Assumes robot COM in projectile motion to calculate next touchdown - """ - determinant = torch.square(vel) - 4 * pos * 0.5 * acc - solution = torch.where(determinant < 0.0, False, True) - no_solution = torch.eq(solution, False).nonzero(as_tuple=True)[0] - t1 = (-vel + torch.sqrt(determinant)) / (2 * 0.5 * acc) - t2 = (-vel - torch.sqrt(determinant)) / (2 * 0.5 * acc) - result = torch.where(t2 > t1, t2, t1) - result[no_solution] = self.dt - return result - - def _update_hl_commands(self): - """ - Checks which environments should have an impulse imparted on them and - updates HL command per-step rollout's velocities accordingly - """ - vel_update_indices = torch.argmax( - (self.hl_impulses[:, 0, :] == self.time_since_hl_query).float(), dim=1 - ).unsqueeze(1) - vel_update_indices[ - (vel_update_indices == 0) - & (self.hl_impulses[:, 0, 0].unsqueeze(1) != self.time_since_hl_query) - ] = -1 - vel_update_indices = vel_update_indices.squeeze(1) - vel_update_envs = torch.nonzero(vel_update_indices != -1) - 
self.hl_commands[vel_update_envs, 3:] += self.hl_impulses[ - vel_update_envs, 1:, vel_update_indices[vel_update_envs] - ] - - def _check_terminations_and_timeouts(self): - """Check if environments need to be reset""" - super()._check_terminations_and_timeouts() - - # * Termination for velocities, orientation, and low height - self.terminated |= (self.base_lin_vel.norm(dim=-1, keepdim=True) > 10).any( - dim=1 - ) - self.terminated |= (self.base_ang_vel.norm(dim=-1, keepdim=True) > 5).any(dim=1) - self.terminated |= (self.projected_gravity[:, 0:1].abs() > 0.7).any(dim=1) - self.terminated |= (self.projected_gravity[:, 1:2].abs() > 0.7).any(dim=1) - self.terminated |= (self.base_pos[:, 2:3] < 0.3).any(dim=1) - - self.to_be_reset = self.timed_out | self.terminated - - # ########################## REWARDS ######################## # - - # * Task rewards * # - - def _reward_tracking_hl_pos(self): - # error = self.hl_commands[:, 2] - self.base_pos[:, 2] - # error /= self.scales["hl_pos"] - # error = error.flatten() - # return self._sqrdexp(error) - error = self.hl_commands[:, :3] - self.base_pos[:, :3] - error /= self.scales["hl_pos"] - return self._sqrdexp(error).sum(dim=1) - - def _reward_tracking_hl_vel(self): - error = self.hl_commands[:, 3:] - self.base_lin_vel - error /= self.scales["hl_vel"] - return self._sqrdexp(error).sum(dim=1) - - def _reward_tracking_ang_vel(self): - ang_vel_error = 0.0 - self.base_ang_vel[:, 2] - ang_vel_error /= self.scales["base_ang_vel"] - return self._sqrdexp(ang_vel_error) - - # * Shaping rewards * # - - def _reward_base_height(self): - error = self.base_height - self.cfg.reward_settings.base_height_target - error /= self.scales["base_height"] - error = error.flatten() - return self._sqrdexp(error) - - def _reward_orientation(self): - return self._sqrdexp(self.projected_gravity[:, 2] + 1) - - def _reward_joint_regularization_legs(self): - # * Reward joint poses and symmetry - reward = self._reward_hip_yaw_zero() - reward += self._reward_hip_abad_symmetry() - reward += self._reward_hip_pitch_symmetry() - return reward / 3.0 - - def _reward_hip_yaw_zero(self): - error = self.dof_pos[:, 0] - self.default_dof_pos[:, 0] - reward = self._sqrdexp(error / self.scales["dof_pos"][0]) / 2.0 - error = self.dof_pos[:, 5] - self.default_dof_pos[:, 5] - reward += self._sqrdexp(error / self.scales["dof_pos"][5]) / 2.0 - return reward - - def _reward_hip_abad_symmetry(self): - error = ( - self.dof_pos[:, 1] / self.scales["dof_pos"][1] - - self.dof_pos[:, 6] / self.scales["dof_pos"][6] - ) - return self._sqrdexp(error) - - def _reward_hip_pitch_symmetry(self): - error = ( - self.dof_pos[:, 2] / self.scales["dof_pos"][2] - + self.dof_pos[:, 7] / self.scales["dof_pos"][7] - ) - return self._sqrdexp(error) - - def _reward_joint_regularization_arms(self): - reward = 0 - reward += self._reward_arm_yaw_symmetry() - reward += self._reward_arm_yaw_zero() - reward += self._reward_arm_abad_zero() - reward += self._reward_arm_abad_symmetry() - reward += self._reward_arm_pitch_symmetry() - reward += self._reward_arm_pitch_zero() - reward += self._reward_elbow_zero() - return reward / 6.0 - - def _reward_arm_pitch_symmetry(self): - error = ( - self.dof_pos[:, 10] / self.scales["dof_pos"][10] - + self.dof_pos[:, 14] / self.scales["dof_pos"][14] - ) - return self._sqrdexp(error) - - def _reward_arm_pitch_zero(self): - error = self.dof_pos[:, 10] - self.default_dof_pos[:, 10] - reward = self._sqrdexp(error / self.scales["dof_pos"][10]) - error = self.dof_pos[:, 14] - self.default_dof_pos[:, 14] 
- reward += self._sqrdexp(error / self.scales["dof_pos"][14]) - return reward / 2.0 - - def _reward_elbow_symmetry(self): - error = ( - self.dof_pos[:, 13] / self.scales["dof_pos"][13] - + self.dof_pos[:, 17] / self.scales["dof_pos"][17] - ) - return self._sqrdexp(error) - - def _reward_elbow_zero(self): - error = self.dof_pos[:, 13] - self.default_dof_pos[:, 13] - reward = self._sqrdexp(error / self.scales["dof_pos"][13]) - error = self.dof_pos[:, 17] - self.default_dof_pos[:, 17] - reward += self._sqrdexp(error / self.scales["dof_pos"][17]) - return reward / 2.0 - - def _reward_arm_yaw_symmetry(self): - error = ( - self.dof_pos[:, 12] / self.scales["dof_pos"][12] - - self.dof_pos[:, 16] / self.scales["dof_pos"][16] - ) - return self._sqrdexp(error) - - def _reward_arm_yaw_zero(self): - error = self.dof_pos[:, 12] - self.default_dof_pos[:, 12] - reward = self._sqrdexp(error / self.scales["dof_pos"][12]) - error = self.dof_pos[:, 16] - self.default_dof_pos[:, 16] - reward += self._sqrdexp(error / self.scales["dof_pos"][16]) - return reward / 2.0 - - def _reward_arm_abad_symmetry(self): - error = ( - self.dof_pos[:, 11] / self.scales["dof_pos"][11] - - self.dof_pos[:, 15] / self.scales["dof_pos"][15] - ) - return self._sqrdexp(error) - - def _reward_arm_abad_zero(self): - error = self.dof_pos[:, 11] - self.default_dof_pos[:, 11] - reward = self._sqrdexp(error / self.scales["dof_pos"][11]) - error = self.dof_pos[:, 15] - self.default_dof_pos[:, 15] - reward += self._sqrdexp(error / self.scales["dof_pos"][15]) - return reward / 2.0 diff --git a/gym/envs/mit_humanoid/humanoid_bouncing_config.py b/gym/envs/mit_humanoid/humanoid_bouncing_config.py deleted file mode 100644 index ee123ad0..00000000 --- a/gym/envs/mit_humanoid/humanoid_bouncing_config.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Configuration file for "fixed arm" (FA) humanoid environment -with potential-based rewards implemented -""" - -import torch -from gym.envs.base.legged_robot_config import LeggedRobotCfg, LeggedRobotRunnerCfg - - -class HumanoidBouncingCfg(LeggedRobotCfg): - class env(LeggedRobotCfg.env): - num_envs = 4096 - num_actuators = 18 - episode_length_s = 5 # 100 - - class terrain(LeggedRobotCfg.terrain): - curriculum = False - mesh_type = "plane" - measure_heights = False - - class init_state(LeggedRobotCfg.init_state): - reset_mode = "reset_to_range" - pos = [0.0, 0.0, 0.75] # x,y,z [m] - rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] - lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] - ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] - - default_joint_angles = { - "hip_yaw": 0.0, - "hip_abad": 0.0, - "hip_pitch": -0.2, - "knee": 0.25, # 0.6 - "ankle": 0.0, - "shoulder_pitch": 0.0, - "shoulder_abad": 0.0, - "shoulder_yaw": 0.0, - "elbow": -1.25, - } - - # ranges for [x, y, z, roll, pitch, yaw] - root_pos_range = [ - [0.0, 0.0], # x - [0.0, 0.0], # y - [0.7, 0.72], # z - [-torch.pi / 10, torch.pi / 10], # roll - [-torch.pi / 10, torch.pi / 10], # pitch - [-torch.pi / 10, torch.pi / 10], # yaw - ] - - # ranges for [v_x, v_y, v_z, w_x, w_y, w_z] - root_vel_range = [ - [-0.5, 2.5], # x - [-0.5, 0.5], # y - [-0.5, 0.5], # z - [-0.5, 0.5], # roll - [-0.5, 0.5], # pitch - [-0.5, 0.5], # yaw - ] - - dof_pos_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.2, 0.2], - "hip_pitch": [-0.2, 0.2], - "knee": [0.6, 0.7], - "ankle": [-0.3, 0.3], - "shoulder_pitch": [0.0, 0.0], - "shoulder_abad": [0.0, 0.0], - "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], - } - - dof_vel_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.1, 0.1], - 
"hip_pitch": [-0.1, 0.1], - "knee": [-0.1, 0.1], - "ankle": [-0.1, 0.1], - "shoulder_pitch": [0.0, 0.0], - "shoulder_abad": [0.0, 0.0], - "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], - } - - class control(LeggedRobotCfg.control): - # stiffness and damping for joints - stiffness = { - "hip_yaw": 30.0, - "hip_abad": 30.0, - "hip_pitch": 30.0, - "knee": 30.0, - "ankle": 30.0, - "shoulder_pitch": 30.0, - "shoulder_abad": 30.0, - "shoulder_yaw": 30.0, - "elbow": 50.0, - } # [N*m/rad] - damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 1.0, - } # [N*m*s/rad] - ctrl_frequency = 100 - desired_sim_frequency = 800 - - class commands(LeggedRobotCfg.commands): - resampling_time = 5.0 - - class ranges: - # TRAINING COMMAND RANGES # - lin_vel_x = [0, 4.5] # min max [m/s] - lin_vel_y = 0.75 # min max [m/s] - yaw_vel = 4.0 # min max [rad/s] - # # PLAY COMMAND RANGES # - # lin_vel_x = [3., 3.] # min max [m/s] - # lin_vel_y = 0. # min max [m/s] - # yaw_vel = 0. # min max [rad/s] - - class high_level: - sec_per_gait = 0.25 - interval = (int)(sec_per_gait * (5.0 / 2.0) * 100) # time steps - - class push_robots: - toggle = True - interval_s = 2.5 - max_push_vel_xy = 0.5 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] - - class domain_rand(LeggedRobotCfg.domain_rand): - randomize_friction = False - friction_range = [0.5, 1.25] - - randomize_base_mass = False - added_mass_range = [-1.0, 1.0] - - class asset(LeggedRobotCfg.asset): - # file = ('{LEGGED_GYM_ROOT_DIR}/resources/robots/rom/urdf/' - # +'humanoid_fixed_arms_full.urdf') - file = ( - "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" - ) - keypoints = ["base"] - end_effectors = ["left_foot", "right_foot"] - # end_effector_names = ['left_toe', 'left_heel', - # 'right_toe', 'right_heel'] - foot_name = "foot" - terminate_after_contacts_on = [ - "base", - "left_upper_leg", - "left_lower_leg", - "right_upper_leg", - "right_lower_leg", - "left_upper_arm", - "right_upper_arm", - "left_lower_arm", - "right_lower_arm", - "left_hand", - "right_hand", - ] - - fix_base_link = False - disable_gravity = False - - disable_motors = False - - # (1: disable, 0: enable...bitwise filter) - self_collisions = 0 - collapse_fixed_joints = False - flip_visual_attachments = False - - # Check GymDofDriveModeFlags - # (0: none, 1: pos tgt, 2: vel target, 3: effort) - default_dof_drive_mode = 3 - - class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.8 - max_contact_force = 1500.0 - - # negative total rewards clipped at zero (avoids early termination) - only_positive_rewards = False # ! zap? 
- base_height_target = 0.6565 - tracking_sigma = 0.5 - - class scaling(LeggedRobotCfg.scaling): - hl_pos = 0.05 # 0.3 # maximum altitude for current bouncing ball param - hl_vel = 2.0 # 4.905 # max impulse imparted by HL - base_height = 0.6565 - base_lin_vel = 1.0 - base_ang_vel = torch.pi - dof_pos = 2 * [0.5, 1, 3, 2, 2] + 2 * [2, 1, 0.5, 2.0] - dof_vel = 1.0 - dof_pos_target = dof_pos - clip_actions = 1000.0 - - -class HumanoidBouncingRunnerCfg(LeggedRobotRunnerCfg): - do_wandb = True - seed = -1 - - class policy(LeggedRobotRunnerCfg.policy): - init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] - # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) - activation = "elu" - normalize_obs = True # True, False - - actor_obs = [ - "base_height", - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "hl_commands", - # "hl_impulses_flat", - # "commands", - # "phase_sin", - # "phase_cos", - # "time_since_hl_query", - "dof_pos_legs", - "dof_vel_legs", - "in_contact", - ] - - critic_obs = actor_obs - - actions = ["dof_pos_target_legs"] - disable_actions = False - - add_noise = True - noise_level = 1.0 # scales other values - - class noise: - base_height = 0.05 - base_lin_vel = 0.1 - base_ang_vel = 0.05 - projected_gravity = 0.05 - dof_pos = 0.005 - dof_vel = 0.01 - in_contact = 0.1 - - class reward: - class weights: - # * Behavioral rewards * # - action_rate = 1.0e-3 - action_rate2 = 1.0e-4 - # tracking_lin_vel = 5.0 - tracking_hl_pos = 5.0 - tracking_hl_vel = 5.0 - tracking_ang_vel = 5.0 - torques = 1e-4 - # dof_pos_limits = 10 - torque_limits = 1e-2 - dof_vel = 1e-4 - - # * Shaping rewards * # - base_height = 0 # 0.1 - orientation = 1.0 - hip_yaw_zero = 2.0 - hip_abad_symmetry = 0.2 - - class termination_weight: - termination = 1 - - class algorithm(LeggedRobotRunnerCfg.algorithm): - # algorithm training hyperparameters - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - num_mini_batches = 4 # minibatch size = num_envs*nsteps/nminibatches - learning_rate = 1.0e-5 - schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 - lam = 0.99 - desired_kl = 0.01 - max_grad_norm = 1.0 - - class runner(LeggedRobotRunnerCfg.runner): - policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 32 - max_iterations = 1500 - run_name = "" - experiment_name = "HumanoidTrajectoryTracking" - save_interval = 50 - plot_input_gradients = False - plot_parameter_gradients = False diff --git a/learning/modules/lqrc/custom_nn.py b/learning/modules/lqrc/custom_nn.py index ee9c9ff0..0d56c658 100644 --- a/learning/modules/lqrc/custom_nn.py +++ b/learning/modules/lqrc/custom_nn.py @@ -170,21 +170,24 @@ def forward(self, x): output = self.activation_2(output) output = self.connection_3(output) output = self.activation_3(output) - C = self.create_cholesky(output[:, :self.L_indices]) + C = self.create_cholesky(output[:, : self.L_indices]) A = C.bmm(C.transpose(1, 2)) - offset = output[:, self.L_indices:] + offset = output[:, self.L_indices :] x_bar = x - offset - y_pred = (x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2))).squeeze(2) + y_pred = ( + x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2)) + ).squeeze(2) return y_pred def save_intermediate(self): """ Forward hook to save A and offset """ + def hook(module, input, output): - C = self.create_cholesky(output[:, :self.L_indices]) + C = self.create_cholesky(output[:, : self.L_indices]) A 
= C.bmm(C.transpose(1, 2)) - offset = output[:, self.L_indices:] + offset = output[:, self.L_indices :] self.intermediates["A"] = A self.intermediates["offset"] = offset @@ -207,13 +210,16 @@ def forward(self, x): output = self.activation_3(output) C = self.create_cholesky(output) A = C.bmm(C.transpose(1, 2)) - y_pred = (xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2))).squeeze(2) + y_pred = ( + xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2)) + ).squeeze(2) return y_pred def save_intermediate(self): """ Forward hook to save A """ + def hook(module, input, output): C = self.create_cholesky(output) A = C.bmm(C.transpose(1, 2)) @@ -225,6 +231,7 @@ def save_xhat(self): """ Forward hook to save xhat for debugging """ + def hook(module, input, output): self.intermediates["xhat"] = output diff --git a/learning/modules/lqrc/evaluate_critic.py b/learning/modules/lqrc/evaluate_critic.py index f4401900..d9eb4023 100644 --- a/learning/modules/lqrc/evaluate_critic.py +++ b/learning/modules/lqrc/evaluate_critic.py @@ -1,6 +1,6 @@ import os -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from utils import critic_eval_args, get_load_path from plotting import ( @@ -68,7 +68,7 @@ def model_switch(args): A_pred.append(model.NN.intermediates["A"]) c_pred.append(model.NN.intermediates["c"]) - save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs") + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc") if not os.path.exists(save_path): os.makedirs(save_path) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index aa8bf02a..2f60562c 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -2,7 +2,7 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np -from matplotlib.colors import CenteredNorm, TwoSlopeNorm +from matplotlib.colors import CenteredNorm matplotlib.rcParams["pdf.fonttype"] = 42 matplotlib.rcParams["ps.fonttype"] = 42 @@ -207,7 +207,11 @@ def plot_predictions_and_gradients( y_pred = y_pred.detach().cpu().numpy() y_actual = y_actual.detach().cpu().numpy() if actual_grad is not None: - actual_grad = actual_grad.detach().cpu().numpy() if not isinstance(actual_grad, np.ndarray) else actual_grad + actual_grad = ( + actual_grad.detach().cpu().numpy() + if not isinstance(actual_grad, np.ndarray) + else actual_grad + ) if colormap_diff: fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, figsize=(16, 8), layout="tight" @@ -224,7 +228,8 @@ def plot_predictions_and_gradients( axis=1, ).reshape(sq_len, sq_len), cmap="OrRd", - vmin=0.0, vmax=30.0, + vmin=0.0, + vmax=30.0, ) plt.colorbar(img1, ax=ax1) img2 = graph_3D_helper(ax2)( @@ -232,7 +237,8 @@ def plot_predictions_and_gradients( x_actual[:, 1].reshape(sq_len, sq_len), (y_pred - y_actual).reshape(sq_len, sq_len), cmap="bwr", - vmin=-2.0, vmax=2.0 + vmin=-2.0, + vmax=2.0, ) plt.colorbar(img2, ax=ax2) set_titles_labels( @@ -252,7 +258,7 @@ def plot_predictions_and_gradients( c=y_pred, cmap="viridis", marker="o", - alpha=0.7 + alpha=0.7, ) scatter = axes[1].scatter( x_actual[:, 0], @@ -260,7 +266,7 @@ def plot_predictions_and_gradients( c=y_actual, cmap="viridis", marker="^", - alpha=0.7 + alpha=0.7, ) fig.colorbar(scatter, ax=axes.ravel().tolist(), shrink=0.95, label="f(x)") axes[0].set_title("Pointwise Predictions") @@ -296,7 +302,7 @@ def plot_predictions_and_gradients( cmap="bwr", marker="o", alpha=0.5, - norm=CenteredNorm() + norm=CenteredNorm(), ) fig.colorbar(scatter, ax=axis, shrink=0.95, 
label="f(x)") axis.set_title("Error Between Pointwise Predictions and Targets") diff --git a/learning/modules/lqrc/run_plot.py b/learning/modules/lqrc/run_plot.py index a5eff73a..9a9b7f24 100644 --- a/learning/modules/lqrc/run_plot.py +++ b/learning/modules/lqrc/run_plot.py @@ -1,12 +1,12 @@ import os -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from learning.modules.lqrc.plotting import plot_training_data_dist read_path = "CholeskyPlusConst_fixed_mean_var_all_obs.npy" output_path = "CholeskyPlusConst_fixed_mean_var_data_dist_1_redo.png" -save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs") +save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc") if __name__ == "__main__": plot_training_data_dist( - LEGGED_GYM_LQRC_DIR + f"/{read_path}", save_path + f"/{output_path}" + LEGGED_GYM_ROOT_DIR + f"/{read_path}", save_path + f"/{output_path}" ) diff --git a/learning/modules/lqrc/train_benchmark.py b/learning/modules/lqrc/train_benchmark.py index a321c208..51000092 100644 --- a/learning/modules/lqrc/train_benchmark.py +++ b/learning/modules/lqrc/train_benchmark.py @@ -16,7 +16,7 @@ CholeskyOffset1, CholeskyOffset2, ) -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from utils import benchmark_args from plotting import ( plot_loss, @@ -105,20 +105,23 @@ def bound_function_smoothly(y): def load_ground_truth(*args, **kwargs): - fn = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + fn = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy" data = torch.from_numpy(np.load(fn)).to(DEVICE).type(torch.float32) return data[:, :-1], data[:, -1].unsqueeze(1) def ground_truth_numerial_gradients(): - fn = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + fn = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy" data = np.load(fn) sq_size = int(np.sqrt(data.shape[0]).item()) gridded_data = data[:, -1].reshape((sq_size, sq_size)) h_theta = abs(data[0, 0] - data[sq_size, 0]).item() h_omega = abs(data[0, 1] - data[1, 1]).item() theta_grad, theta_omega = np.gradient(gridded_data, h_theta, h_omega) - return np.concatenate((theta_grad.flatten().reshape(-1, 1), theta_omega.flatten().reshape(-1, 1)), axis=1) + return np.concatenate( + (theta_grad.flatten().reshape(-1, 1), theta_omega.flatten().reshape(-1, 1)), + axis=1, + ) def model_switch(input_dim, model_name=None): @@ -134,13 +137,20 @@ def model_switch(input_dim, model_name=None): num_obs = input_dim hidden_dims = [128, 64, 32] activation = "tanh" - return create_MLP(num_obs, 1, hidden_dims, activation).to(DEVICE), torch.nn.MSELoss(reduction="mean") + return create_MLP(num_obs, 1, hidden_dims, activation).to( + DEVICE + ), torch.nn.MSELoss(reduction="mean") elif model_name == "CholeskyOffset1": - return CholeskyOffset1(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss(reduction="mean") + return CholeskyOffset1(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + reduction="mean" + ) elif model_name == "CholeskyOffset2": - return CholeskyOffset2(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss(reduction="mean") + return CholeskyOffset2(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( + reduction="mean" + ) return BaselineMLP(input_dim, device=DEVICE).to(DEVICE), torch.nn.MSELoss( - reduction="mean") + reduction="mean" + ) def test_case_switch(case_name=None): @@ -212,7 +222,11 @@ def gradient_switch(test_case, **kwargs): for X_batch, y_batch in train_dataloader: X_batch.requires_grad_() y_pred = model(X_batch) - loss = loss_fn(y_pred, y_batch, 
intermediates=model.intermediates) if model_type == "CholeskyPlusConst" else loss_fn(y_pred, y_batch) + loss = ( + loss_fn(y_pred, y_batch, intermediates=model.intermediates) + if model_type == "CholeskyPlusConst" + else loss_fn(y_pred, y_batch) + ) loss_per_batch.append(loss.item()) optimizer.zero_grad() loss.backward() @@ -255,7 +269,11 @@ def gradient_switch(test_case, **kwargs): X_batch.requires_grad_() X_batch = X_batch + 0 y_pred = model(X_batch) - loss = loss_fn(y_pred, y_batch, intermediates=model.intermediates) if model_type == "CholeskyPlusConst" else loss_fn(y_pred, y_batch) + loss = ( + loss_fn(y_pred, y_batch, intermediates=model.intermediates) + if model_type == "CholeskyPlusConst" + else loss_fn(y_pred, y_batch) + ) if model_type == "QuadraticNetCholesky": # ! do we want this anymore? A_pred = model.intermediates["A"] @@ -283,7 +301,7 @@ def gradient_switch(test_case, **kwargs): print(f"Total training time: {timer.get_time('total training')}") time_str = time.strftime("%Y%m%d_%H%M%S") - save_path = os.path.join(LEGGED_GYM_LQRC_DIR, "logs", save_str) + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", save_str) if not os.path.exists(save_path): os.makedirs(save_path) if save_model: @@ -296,7 +314,7 @@ def gradient_switch(test_case, **kwargs): params=dict(model.named_parameters()), show_attrs=True, show_saved=True, - ).render(f"{LEGGED_GYM_LQRC_DIR}/logs/model_viz", format="png") + ).render(f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/model_viz", format="png") print("Saving to", save_path) grad_kwargs = {"all_inputs": all_inputs} if test_case == "rosenbrock" else {} diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 11be1b14..5623b80c 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -89,8 +89,6 @@ def forward(self, input, mask=None): current_mean = self.running_mean current_var = self.running_var - # current_mean = torch.tensor([torch.pi, 0.0], device="cuda:0") - # current_var = torch.tensor([((200.0)**2.0/12.0), ((2.0*torch.pi)/12.0)], device="cuda:0") y = (input - current_mean.float()) / torch.sqrt( current_var.float() + self.epsilon diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index 4c523e37..8a21f661 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -5,5 +5,4 @@ class CriticOnlyRunner(DataLoggingRunner): def load(self, path): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict( - loaded_dict["actor_state_dict"]) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) diff --git a/learning/runners/data_logging_runner.py b/learning/runners/data_logging_runner.py index 4bca88bc..db766c31 100644 --- a/learning/runners/data_logging_runner.py +++ b/learning/runners/data_logging_runner.py @@ -1,7 +1,7 @@ import torch import numpy as np import os -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from learning.utils import Logger @@ -78,7 +78,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") @@ -89,8 +89,9 @@ def learn(self): self.save() self.all_obs = self.all_obs.detach().cpu().numpy() save_path = os.path.join( - LEGGED_GYM_LQRC_DIR, + LEGGED_GYM_ROOT_DIR, "logs", + "lqrc", "standard_training_data.npy" if self.policy_cfg["standard_critic_nn"] else "custom_training_data.npy", diff --git 
a/scripts/lqrc/brute_force_value_function.py b/scripts/lqrc/brute_force_value_function.py index 3463e2d6..17b5a6a1 100644 --- a/scripts/lqrc/brute_force_value_function.py +++ b/scripts/lqrc/brute_force_value_function.py @@ -4,7 +4,6 @@ from gym.envs import __init__ # noqa: F401 from gym import LEGGED_GYM_ROOT_DIR from gym.utils import get_args, task_registry -from learning import LEGGED_GYM_LQRC_DIR from learning.modules import Critic from learning.modules.lqrc.utils import get_load_path from learning.modules.lqrc.plotting import ( @@ -153,9 +152,9 @@ def query_value_function(vf_args, grid): DEVICE = "cuda:0" steps = 50 npy_fn = ( - f"{LEGGED_GYM_LQRC_DIR}/logs/custom_training_data.npy" + f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_training_data.npy" if args.custom_critic - else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_training_data.npy" + else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_training_data.npy" ) data = np.load(npy_fn) # ! data is normalized @@ -167,7 +166,7 @@ def query_value_function(vf_args, grid): EXPORT_POLICY = False time_str = time.strftime("%b%d_%H-%M-%S") - save_path = os.path.join(LEGGED_GYM_LQRC_DIR, f"logs/{time_str}") + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, f"logs/{time_str}") if not os.path.exists(save_path): os.makedirs(save_path) # get ground truth @@ -176,13 +175,12 @@ def query_value_function(vf_args, grid): ground_truth_returns, logs = get_ground_truth(env, runner, train_cfg, grid) grid_gt_combined = np.hstack((grid.detach().cpu().numpy(), ground_truth_returns.T)) - gt_save_path = f"{LEGGED_GYM_LQRC_DIR}/logs/ground_truth.npy" + gt_save_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/ground_truth.npy" np.save(gt_save_path, grid_gt_combined) print("Saved high returns to", gt_save_path) exit() - # logs = {k: v.detach().cpu().numpy() for k, v in logs.items()} # np.savez_compressed(os.path.join(save_path, "data.npz"), **logs) # print("saved logs to", os.path.join(save_path, "data.npz")) @@ -199,9 +197,9 @@ def query_value_function(vf_args, grid): # ) # high_gt_returns = np.array(high_gt_returns) # returns_save_path = ( - # f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy" + # f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_high_returns.npy" # if args.custom_critic - # else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy" + # else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_high_returns.npy" # ) # np.save(returns_save_path, high_gt_returns) # print("Saved high returns to", returns_save_path) @@ -241,5 +239,4 @@ def query_value_function(vf_args, grid): ) # ! 
store data dist with model logs to ensure they're paired properly - plot_training_data_dist(npy_fn, - save_path + "/data_distribution.png") + plot_training_data_dist(npy_fn, save_path + "/data_distribution.png") diff --git a/scripts/lqrc/rollout_init_conditions.py b/scripts/lqrc/rollout_init_conditions.py index 4209abab..b408f81a 100644 --- a/scripts/lqrc/rollout_init_conditions.py +++ b/scripts/lqrc/rollout_init_conditions.py @@ -7,7 +7,7 @@ # torch needs to be imported after isaacgym imports in local source import torch -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR from learning.modules.lqrc.plotting import plot_trajectories @@ -89,9 +89,9 @@ def play(env, runner, train_cfg, init_conditions): args = get_args() DEVICE = "cuda:0" npy_fn = ( - f"{LEGGED_GYM_LQRC_DIR}/logs/custom_high_returns.npy" + f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/custom_high_returns.npy" if args.custom_critic - else f"{LEGGED_GYM_LQRC_DIR}/logs/standard_high_returns.npy" + else f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/standard_high_returns.npy" ) init_conditions = np.array([[0.25, -0.75, 0.2, -0.7]]) # np.load(npy_fn) num_init_conditions = 2 @@ -100,13 +100,16 @@ def play(env, runner, train_cfg, init_conditions): pos_traj, vel_traj, torques, rewards = play( env, runner, train_cfg, init_conditions ) + + high = np.hstack((init_conditions[0, 0], init_conditions[0, 2])) + low = np.hstack((init_conditions[0, 1], init_conditions[0, 3])) + title = f"High Return Init {high}, Low Return Init {low}" + plot_trajectories( pos_traj, vel_traj, torques, rewards, - f"{LEGGED_GYM_LQRC_DIR}/logs/trajectories.png", - title=f"High Return Init (theta, omega) {np.hstack((init_conditions[0, 0], init_conditions[0, 2]))}, Low Return Init {np.hstack((init_conditions[0, 1], init_conditions[0, 3]))}", + f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/trajectories.png", + title=title, ) - # plot_theta_omega_polar(np.vstack((init_conditions[:, 0], init_conditions[:, 2])), np.vstack((init_conditions[:, 1], init_conditions[:, 3])), npy_fn[:-4] + ".png") - # print(init_conditions[0, :]) diff --git a/scripts/lqrc/simple_plots.py b/scripts/lqrc/simple_plots.py index 06b30c98..846af7b2 100644 --- a/scripts/lqrc/simple_plots.py +++ b/scripts/lqrc/simple_plots.py @@ -1,6 +1,6 @@ import numpy as np import matplotlib.pyplot as plt -from learning import LEGGED_GYM_LQRC_DIR +from learning import LEGGED_GYM_ROOT_DIR import matplotlib.collections as mcoll @@ -202,13 +202,13 @@ def plot_initial_conditions_colored_by_return( if __name__ == "__main__": experiment = "Feb23_14-33-51" - file_path = f"{LEGGED_GYM_LQRC_DIR}/logs/{experiment}/data.npz" + file_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/{experiment}/data.npz" data = np.load(file_path) rewards_data = {key: data[key] for key in data.files if key[:2] == "r_"} data = {key: data[key] for key in data.files if key[:2] != "r_"} - save_path = f"{LEGGED_GYM_LQRC_DIR}/logs/{experiment}" + save_path = f"{LEGGED_GYM_ROOT_DIR}/logs/lqrc/{experiment}" returns = calculate_cumulative_returns(rewards_data) # setting colormap outside, so that all returns-related plots use the same From 49c7be7ffee22ecb4adc43aa8991d38cfcf8347b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 13 Mar 2024 14:59:49 -0400 Subject: [PATCH 20/69] WIP: splitting actor_critic --- gym/envs/pendulum/pendulum_config.py | 6 ++- .../logging_and_saving/wandb_singleton.py | 2 +- learning/algorithms/ppo2.py | 12 +++--- learning/modules/actor.py | 4 +- learning/modules/actor_critic.py | 40 ++----------------- learning/modules/critic.py | 4 +- 
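# [Editor's note - condensed sketch of the split introduced by this commit, not part of
# the diffstat above] From this patch on, PPO2 receives the actor and the critic as two
# separate modules and keeps a separate Adam optimizer for each, while the runner
# (BaseRunner._set_up_alg in the hunks below) constructs both from the config and hands
# them to the algorithm. Roughly, with placeholder sizes (the dims and obs counts here
# are illustrative only, not taken from any config in this series):
from learning.algorithms import PPO2
from learning.modules import Actor, Critic

num_actor_obs, num_critic_obs, num_actions = 48, 48, 12  # placeholders
actor = Actor(num_actor_obs, num_actions, actor_hidden_dims=[256, 256, 128])
critic = Critic(num_critic_obs, critic_hidden_dims=[128, 64])
alg = PPO2(actor, critic, learning_rate=1.0e-3, device="cpu")
# Inside PPO2.__init__ (per the ppo2.py hunk below), each module gets its own optimizer:
#   self.optimizer        = optim.Adam(self.actor.parameters(),  lr=learning_rate)
#   self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)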
learning/modules/utils/normalize.py | 2 +- learning/runners/BaseRunner.py | 22 +++++----- learning/runners/my_runner.py | 11 ++--- learning/runners/on_policy_runner.py | 26 +++++------- 10 files changed, 45 insertions(+), 84 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index e74212d4..5c92c448 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -61,7 +61,6 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): class policy(FixedRobotCfgPPO.policy): actor_hidden_dims = [128, 64, 32] - critic_hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" @@ -111,6 +110,11 @@ class algorithm(FixedRobotCfgPPO.algorithm): standard_loss = True plus_c_penalty = 0.1 + class critic: + standard_nn = True + hidden_dims = [128, 64, 32] + activation = "tanh" + class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 69d6dcd2..d0d40627 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -13,7 +13,8 @@ class PPO2: def __init__( self, - actor_critic, + actor, + critic, num_learning_epochs=1, num_mini_batches=1, clip_param=0.2, @@ -36,11 +37,8 @@ def __init__( # * PPO components # todo keeping actor_critic for loading code in runner - self.actor_critic = actor_critic.to(self.device) - self.actor = actor_critic.actor.to(self.device) - self.critic = actor_critic.critic.to(self.device) - self.storage = None # initialized later - # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.actor = actor.to(self.device) + self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) @@ -58,7 +56,7 @@ def test_mode(self): self.actor.test() self.critic.test() - def train_mode(self): + def switch_to_train(self): self.actor.train() self.critic.train() diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 7f40d502..e4ecce75 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -11,7 +11,7 @@ def __init__( self, num_obs, num_actions, - hidden_dims, + actor_hidden_dims, activation="elu", init_noise_std=1.0, normalize_obs=True, @@ -30,7 +30,7 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.NN = create_MLP(num_obs, num_actions, actor_hidden_dims, activation) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 157ba38b..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,49 +1,17 @@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - 
self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - standard_critic_nn=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, - critic_hidden_dims, - activation, - normalize_obs, - standard_critic_nn, - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: {self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/critic.py b/learning/modules/critic.py index ea9e552d..4a7e1c12 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -10,7 +10,7 @@ class Critic(nn.Module): def __init__( self, num_obs, - hidden_dims=None, + critic_hidden_dims=None, activation="elu", normalize_obs=True, standard_nn=True, @@ -24,7 +24,7 @@ def __init__( super().__init__() self.NN = ( - create_MLP(num_obs, 1, hidden_dims, activation) + create_MLP(num_obs, 1, critic_hidden_dims, activation) if standard_nn else CholeskyPlusConst(num_obs) ) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index d00b69b5..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -4,7 +4,7 @@ class RunningMeanStd(nn.Module): def __init__(self, num_items, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + super().__init__() self.num_items = num_items self.epsilon = epsilon diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index d41e7f07..6727c13a 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import ActorCritic +from learning.modules import Actor, Critic from learning.utils import remove_zero_weighted_rewards @@ -10,22 +10,22 @@ def __init__(self, env, train_cfg, device="cpu"): self.env = env self.parse_train_cfg(train_cfg) - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) + num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) + num_actions = self.get_action_size(self.policy_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **self.policy_cfg) + critic = Critic(num_critic_obs, **self.policy_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def 
parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index cdd30400..7cc65441 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -24,11 +24,10 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() + self.alg.switch_to_train() actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -124,10 +123,6 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index d6415816..dd14ff31 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -26,11 +26,10 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() + self.alg.switch_to_train() actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -140,21 +139,17 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "iter": self.it, }, @@ -163,18 +158,19 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) + self.alg.actor.export_policy(path) From 10247bea65750a7e8f55fc36d4c3881aee47bfe1 Mon Sep 17 00:00:00 2001 From: Steve 
Heim Date: Wed, 13 Mar 2024 15:17:19 -0400 Subject: [PATCH 21/69] WIP: more clean post actor/critic, exporting --- learning/algorithms/ppo2.py | 3 --- learning/runners/on_policy_runner.py | 2 +- setup.py | 5 ++-- .../test_runner_integration.py | 25 ++++++++----------- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index d0d40627..98a4904c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -9,8 +9,6 @@ class PPO2: - # actor_critic: ActorCritic - def __init__( self, actor, @@ -36,7 +34,6 @@ def __init__( self.learning_rate = learning_rate # * PPO components - # todo keeping actor_critic for loading code in runner self.actor = actor.to(self.device) self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index dd14ff31..dc0aa2bd 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -173,4 +173,4 @@ def get_inference_actions(self): return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor.export_policy(path) + self.alg.actor.export(path) diff --git a/setup.py b/setup.py index c918d911..8a3685da 100644 --- a/setup.py +++ b/setup.py @@ -2,17 +2,16 @@ from distutils.core import setup setup( - name="gpuGym", + name="pkGym", version="1.0.1", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), - description="Isaac Gym environments for Legged Robots", + description="Isaac Gym environments", install_requires=[ "isaacgym", "matplotlib", "pandas", - "tensorboard", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 4781a665..ef2fc6bb 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -7,7 +7,7 @@ # * imports for onnx test import onnx import onnxruntime -from learning.modules import ActorCritic +from learning.modules import Actor from gym.utils.helpers import get_load_path from gym import LEGGED_GYM_ROOT_DIR @@ -28,20 +28,17 @@ def learn_policy(args): def load_saved_policy(runner): num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_critic_obs = runner.get_obs_size(runner.policy_cfg["critic_obs"]) num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg - ).to(runner.device) + actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) loaded_dict = torch.load(resume_path) - actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - actor_critic.eval() - return actor_critic + actor.load_state_dict(loaded_dict["actor_state_dict"]) + actor.eval() + return actor class TestDefaultIntegration: @@ -96,19 +93,19 @@ def test_default_integration_settings(self, args): ) obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) - actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) - actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone() + 
actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() assert torch.equal(actions_first, actions_loaded), "Model loading failed" # * test onnx exporting - loaded_actor_critic = load_saved_policy(runner) + loaded_actor = load_saved_policy(runner) export_path = os.path.join( LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy" ) - loaded_actor_critic.export_policy(export_path) + loaded_actor.export(export_path) # load the exported onnx path_to_onnx = os.path.join(export_path, "policy.onnx") @@ -122,8 +119,8 @@ def test_default_integration_settings(self, args): # compute torch output with torch.no_grad(): test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] - runner_out = runner.alg.actor_critic.actor.act_inference(test_input) - loaded_out = loaded_actor_critic.actor.act_inference(test_input) + runner_out = runner.alg.actor.act_inference(test_input) + loaded_out = loaded_actor.act_inference(test_input) # compute ONNX Runtime output prediction ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()} From 2cf94abd0088e97a2c7070b61fe57971713d7e37 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 13 Mar 2024 18:20:53 -0400 Subject: [PATCH 22/69] Updating all envs and configs to have split actor and critic --- gym/envs/a1/a1_config.py | 35 ++++++++-------- .../anymal_c/flat/anymal_c_flat_config.py | 35 ++++++++-------- .../mixed_terrains/anymal_c_rough_config.py | 7 +++- gym/envs/base/fixed_robot_config.py | 4 +- gym/envs/base/legged_robot_config.py | 23 +++++++---- gym/envs/cartpole/cartpole_config.py | 23 ++++++++--- gym/envs/mini_cheetah/mini_cheetah_config.py | 38 +++++++++-------- .../mini_cheetah/mini_cheetah_osc_config.py | 8 ++-- .../mini_cheetah/mini_cheetah_ref_config.py | 35 ++++++++++------ .../mit_humanoid/humanoid_running_config.py | 32 +++++++++++---- gym/envs/mit_humanoid/mit_humanoid_config.py | 26 +++++++++--- gym/envs/pendulum/pendulum_config.py | 24 ++++++----- gym/scripts/sweep_configs/sweep_pendulum.json | 8 ++-- learning/algorithms/ppo.py | 38 ++--------------- learning/modules/actor.py | 9 +--- learning/modules/critic.py | 9 +--- learning/runners/BaseRunner.py | 15 +++---- learning/runners/data_logging_runner.py | 41 +++++++++++-------- learning/runners/my_runner.py | 16 ++++---- learning/runners/old_policy_runner.py | 28 ++++++------- learning/runners/on_policy_runner.py | 22 +++++----- learning/utils/PBRS/test_usecase.py | 8 ++-- scripts/lqrc/brute_force_value_function.py | 6 +-- scripts/lqrc/rollout_init_conditions.py | 4 +- scripts/play.py | 4 +- .../test_runner_integration.py | 12 +++--- tests/regression_tests/run_n_iterations.py | 2 +- 27 files changed, 274 insertions(+), 238 deletions(-) diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..190ce41a 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -138,12 +138,11 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +153,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + 
projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +176,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -213,7 +216,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..95d95be9 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -121,13 +121,12 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +137,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +160,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -196,6 +199,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, 
crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 4d039f74..8c7427f8 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -125,7 +125,7 @@ class logging: class policy: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -134,7 +134,7 @@ class policy: # rnn_hidden_size = 512 # rnn_num_layers = 1 - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 2955f5a5..7e6cabc6 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -233,24 +233,18 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] actions = ["q_des"] disable_actions = False @@ -263,6 +257,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..f18d5a3b 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -71,11 +71,10 @@ class policy(FixedRobotCfgPPO.policy): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +82,6 @@ class policy(FixedRobotCfgPPO.policy): "pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +91,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + "cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -125,7 +136,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 96 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 689fcd64..28c70123 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ 
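A minimal sketch of how the split actor/critic config classes above are meant to be consumed: each side now carries its own hidden_dims, activation, and obs list, so the two networks can be sized and built independently. The config values, per-observation sizes, and build_mlp helper here are illustrative assumptions, not the repo's create_MLP.

import torch.nn as nn

class actor_cfg:  # hypothetical stand-in for e.g. A1RunnerCfg.actor
    hidden_dims = [256, 256, 256]
    activation = "elu"
    obs = ["base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel"]

class critic_cfg:  # hypothetical stand-in for e.g. A1RunnerCfg.critic
    hidden_dims = [256, 256, 256]
    activation = "elu"
    obs = ["base_height", "base_lin_vel"] + actor_cfg.obs  # critic may see privileged states

act_map = {"elu": nn.ELU, "relu": nn.ReLU, "tanh": nn.Tanh}

def build_mlp(num_in, num_out, hidden_dims, activation):
    layers, last = [], num_in
    for size in hidden_dims:
        layers += [nn.Linear(last, size), act_map[activation]()]
        last = size
    layers.append(nn.Linear(last, num_out))
    return nn.Sequential(*layers)

# observation sizes are illustrative only
obs_sizes = {"base_height": 1, "base_lin_vel": 3, "base_ang_vel": 3, "projected_gravity": 3,
             "commands": 3, "dof_pos_obs": 12, "dof_vel": 12}
num_actor_obs = sum(obs_sizes[name] for name in actor_cfg.obs)
num_critic_obs = sum(obs_sizes[name] for name in critic_cfg.obs)
actor_net = build_mlp(num_actor_obs, 12, actor_cfg.hidden_dims, actor_cfg.activation)    # 12 actions assumed
critic_net = build_mlp(num_critic_obs, 1, critic_cfg.hidden_dims, critic_cfg.activation)  # scalar value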
b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -126,13 +126,12 @@ class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -141,19 +140,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - critic_obs = [ - "base_height", - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -165,8 +154,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic: + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..a4acefa6 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -161,13 +161,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] + class policy: + hidden_dims = [256, 256, 128] critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -244,5 +244,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7911206a..56d369c0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,15 +67,14 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -84,7 +83,25 @@ class policy(MiniCheetahRunnerCfg.policy): "phase_obs", ] - critic_obs = [ + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, 
sigmoid + activation = "elu" + normalize_obs = True + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -96,12 +113,6 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_pos_target", ] - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass - class reward: class weights: tracking_lin_vel = 4.0 diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 37852232..8f072f79 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -275,7 +289,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 max_iterations = 1000 run_name = "HumanoidRunning" diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..86a8d606 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -181,14 +181,14 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -198,9 +198,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_history", ] - critic_obs = actor_obs actions = ["dof_pos_target"] + disable_actions = False class noise: base_height = 0.05 @@ -211,6 +211,22 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + class reward: class weights: tracking_ang_vel = 
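The point of the asymmetric obs lists above (base_height and base_lin_vel appearing only on the critic side) is that the critic can read clean, privileged state while the actor sees noisy observations. A toy sketch of that split, assuming uniform noise scaled per entry as in the configs' noise classes; the repo's get_noisy_obs may differ in detail.

import torch

noise_scales = {"dof_pos_obs": 0.005, "dof_vel": 0.005, "base_ang_vel": 0.05}  # values from the configs above
states = {name: torch.zeros(4096, 3) for name in noise_scales}  # toy shapes: 4096 envs, 3 dims each

# the actor is fed perturbed observations...
noisy_actor_obs = torch.cat(
    [states[k] + s * (2.0 * torch.rand_like(states[k]) - 1.0) for k, s in noise_scales.items()],
    dim=-1,
)
# ...while the critic reads the clean states (plus privileged ones such as base_lin_vel)
clean_critic_obs = torch.cat(list(states.values()), dim=-1)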
0.5 @@ -248,7 +264,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 5c92c448..920356cd 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -59,26 +59,33 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "DataLoggingRunner" - class policy(FixedRobotCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor: + hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - actor_obs = [ + obs = [ "dof_pos", "dof_vel", ] - critic_obs = actor_obs - actions = ["tau_ff"] disable_actions = False - standard_critic_nn = True class noise: dof_pos = 0.0 dof_vel = 0.0 + class critic: + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + standard_critic_nn = True + class reward: class weights: theta = 0.0 @@ -110,11 +117,6 @@ class algorithm(FixedRobotCfgPPO.algorithm): standard_loss = True plus_c_penalty = 0.1 - class critic: - standard_nn = True - hidden_dims = [128, 64, 32] - activation = "tanh" - class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" diff --git a/gym/scripts/sweep_configs/sweep_pendulum.json b/gym/scripts/sweep_configs/sweep_pendulum.json index 050d2457..b8259a0e 100644 --- a/gym/scripts/sweep_configs/sweep_pendulum.json +++ b/gym/scripts/sweep_configs/sweep_pendulum.json @@ -6,16 +6,16 @@ }, "run_cap": 50, "parameters": { - "train_cfg.policy.reward.weights.theta":{ + "train_cfg.critic.reward.weights.theta":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.omega":{ + "train_cfg.critic.reward.weights.omega":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.equilibrium":{ + "train_cfg.critic.reward.weights.equilibrium":{ "values": [1.0, 10.0] }, - "train_cfg.policy.reward.weights.energy":{ + "train_cfg.critic.reward.weights.energy":{ "values": [1.0, 10.0] } } diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 5befdf4e..92898ad5 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -36,7 +36,6 @@ from learning.modules import ActorCritic from learning.storage import RolloutStorage -from learning.modules.lqrc import CustomCholeskyPlusConstLoss class PPO: @@ -58,8 +57,6 @@ def __init__( schedule="fixed", desired_kl=0.01, device="cpu", - standard_loss=True, - plus_c_penalty=0.0, **kwargs, ): self.device = device @@ -91,10 +88,6 @@ def __init__( self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - # * custom NN parameters - self.standard_loss = standard_loss - self.plus_c_penalty = plus_c_penalty - def init_storage( self, num_envs, @@ -216,36 +209,11 @@ def update(self): value_clipped = target_values_batch + ( value_batch - target_values_batch ).clamp(-self.clip_param, self.clip_param) - if self.standard_loss: - value_losses = (value_batch - returns_batch).pow(2) - value_losses_clipped = (value_clipped - returns_batch).pow(2) - else: - value_losses = CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_batch, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) - value_losses_clipped = 
CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_clipped, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) + value_losses = (value_batch - returns_batch).pow(2) + value_losses_clipped = (value_clipped - returns_batch).pow(2) value_loss = torch.max(value_losses, value_losses_clipped).mean() else: - if self.standard_loss: - value_loss = (returns_batch - value_batch).pow(2).mean() - else: - value_losses = CustomCholeskyPlusConstLoss( - const_penalty=self.plus_c_penalty - ).forward( - value_batch, - returns_batch, - self.actor_critic.critic.NN.intermediates, - ) + value_loss = (returns_batch - value_batch).pow(2).mean() loss = ( surrogate_loss diff --git a/learning/modules/actor.py b/learning/modules/actor.py index e4ecce75..fbaa6868 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -11,17 +11,12 @@ def __init__( self, num_obs, num_actions, - actor_hidden_dims, + hidden_dims, activation="elu", init_noise_std=1.0, normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs @@ -30,7 +25,7 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, actor_hidden_dims, activation) + self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 4a7e1c12..2733ee40 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -10,21 +10,16 @@ class Critic(nn.Module): def __init__( self, num_obs, - critic_hidden_dims=None, + hidden_dims=None, activation="elu", normalize_obs=True, standard_nn=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self.NN = ( - create_MLP(num_obs, 1, critic_hidden_dims, activation) + create_MLP(num_obs, 1, hidden_dims, activation) if standard_nn else CholeskyPlusConst(num_obs) ) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 6727c13a..9e5ab2e3 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -19,19 +19,20 @@ def __init__(self, env, train_cfg, device="cpu"): self._set_up_alg() def _set_up_alg(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor = Actor(num_actor_obs, num_actions, **self.policy_cfg) - critic = Critic(num_critic_obs, **self.policy_cfg) + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + 
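With the custom Cholesky loss stripped out of ppo.py above, the value objective reduces to a plain squared error with optional clipping; in isolation it reads as follows (flat tensors assumed).

import torch

def value_loss(values, target_values, returns, clip_param=0.2, use_clipped_value_loss=True):
    if use_clipped_value_loss:
        # clip the value update around the value predicted at collection time
        clipped = target_values + (values - target_values).clamp(-clip_param, clip_param)
        return torch.max((values - returns).pow(2), (clipped - returns).pow(2)).mean()
    return (returns - values).pow(2).mean()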
self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError diff --git a/learning/runners/data_logging_runner.py b/learning/runners/data_logging_runner.py index ac2a9232..dbedee99 100644 --- a/learning/runners/data_logging_runner.py +++ b/learning/runners/data_logging_runner.py @@ -23,14 +23,23 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + # def _set_up_alg(self): + # num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + # num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + # num_actions = self.get_action_size(self.actor_cfg["actions"]) + # actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + # critic = Critic(num_critic_obs, **self.critic_cfg) + # alg_class = eval(self.cfg["algorithm_class_name"]) + # self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) self.all_obs[: self.env.num_envs, :] = actor_obs @@ -63,9 +72,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -79,9 +88,9 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) start = self.env.num_envs * self.it end = self.env.num_envs * (self.it + 1) @@ -129,7 +138,7 @@ def learn(self): "logs", "lqrc", "standard_training_data.npy" - if self.policy_cfg["standard_critic_nn"] + if self.critic_cfg["standard_critic_nn"] else "custom_training_data.npy", ) @@ -144,29 +153,25 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git 
a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 7cc65441..40c6ef4e 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -25,8 +25,8 @@ def learn(self): rewards_dict = {} self.alg.switch_to_train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -57,9 +57,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -73,9 +73,9 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["actor_obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -115,9 +115,9 @@ def learn(self): self.save() def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 9a738b93..6d8edf35 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -25,8 +25,8 @@ def learn(self): rewards_dict = {} self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -40,17 +40,17 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -84,21 +84,21 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - 
list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( @@ -134,16 +134,16 @@ def switch_to_eval(self): self.alg.actor_critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor_critic.actor.act_inference(obs) def export(self, path): self.alg.actor_critic.export_policy(path) def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) self.alg.init_storage( self.env.num_envs, self.num_steps_per_env, diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index dc0aa2bd..91143d4b 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -27,8 +27,8 @@ def learn(self): rewards_dict = {} self.alg.switch_to_train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -59,9 +59,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -75,9 +75,9 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() @@ -119,21 +119,21 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( @@ -169,7 +169,7 @@ def switch_to_eval(self): self.alg.critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor.act_inference(obs) def export(self, path): diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 
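The update_rewards pattern shared by the runners above boils down to: per-step weights are scaled by the env dt and masked to non-terminated envs, while termination weights fire only on terminated ones. A self-contained sketch; the internals of get_rewards are assumed here.

import torch

def total_step_reward(raw_terms, weights, termination_weights, terminated, dt):
    rewards = {}
    for name, w in weights.items():              # regular terms: scaled by dt, live envs only
        rewards[name] = w * dt * raw_terms[name] * (~terminated).float()
    for name, w in termination_weights.items():  # termination terms: terminated envs only
        rewards[name] = w * raw_terms[name] * terminated.float()
    return torch.stack(tuple(rewards.values())).sum(dim=0), rewards

terminated = torch.tensor([False, True])
raw = {"tracking_lin_vel": torch.ones(2), "termination": torch.ones(2)}
total, per_term = total_step_reward(
    raw, {"tracking_lin_vel": 4.0}, {"termination": -15.0}, terminated, dt=0.02
)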
--- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = {"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/scripts/lqrc/brute_force_value_function.py b/scripts/lqrc/brute_force_value_function.py index 17b5a6a1..5634f710 100644 --- a/scripts/lqrc/brute_force_value_function.py +++ b/scripts/lqrc/brute_force_value_function.py @@ -98,15 +98,15 @@ def get_ground_truth(env, runner, train_cfg, grid): ) rewards_dict = {} rewards = np.zeros((runner.num_steps_per_env, env.num_envs)) - reward_list = runner.policy_cfg["reward"]["weights"] + reward_list = runner.critic_cfg["reward"]["weights"] n_steps = runner.num_steps_per_env states_to_log, logs = create_logging_dict(runner, n_steps, reward_list) for i in range(runner.num_steps_per_env): runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() terminated = runner.get_terminated() diff --git a/scripts/lqrc/rollout_init_conditions.py b/scripts/lqrc/rollout_init_conditions.py index b408f81a..8f54dd1d 100644 --- a/scripts/lqrc/rollout_init_conditions.py +++ b/scripts/lqrc/rollout_init_conditions.py @@ -65,9 +65,9 @@ def play(env, runner, train_cfg, init_conditions): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() pos_traj[i, :] = env.dof_pos.detach().cpu().numpy().squeeze() diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..4486daae 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -46,9 +46,9 @@ def play(env, runner, train_cfg): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() env.check_exit() diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index ef2fc6bb..926bbef9 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -27,9 +27,9 @@ def learn_policy(args): def load_saved_policy(runner): - num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) + num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"]) + num_actions = runner.get_action_size(runner.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device) resume_path = get_load_path( 
name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], @@ -57,7 +57,7 @@ def test_default_integration_settings(self, args): with torch.no_grad(): actions = runner.get_inference_actions() - deployed_actions = runner.env.get_states(runner.policy_cfg["actions"]) + deployed_actions = runner.env.get_states(runner.actor_cfg["actions"]) assert ( torch.equal(actions, torch.zeros_like(actions)) is False ), "Policy returning all zeros" @@ -92,7 +92,7 @@ def test_default_integration_settings(self, args): f"{model_5_path}" "(last iteration) was not saved" ) - obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) + obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"])) actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() @@ -118,7 +118,7 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] + test_input = runner.get_obs(runner.actor_cfg["obs"])[0:1] runner_out = runner.alg.actor.act_inference(test_input) loaded_out = loaded_actor.act_inference(test_input) diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py index 6a3a4965..29e2f424 100644 --- a/tests/regression_tests/run_n_iterations.py +++ b/tests/regression_tests/run_n_iterations.py @@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner): runner.learn() # * get the test values after learning - actions = runner.env.get_states(runner.policy_cfg["actions"]) + actions = runner.env.get_states(runner.actor_cfg["actions"]) # * return the values to the parent for assertion actions.detach().cpu() From 2a04147bbfdfc7c77d35c4d641bcfa84481dbe6b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 14 Mar 2024 13:07:58 -0400 Subject: [PATCH 23/69] fix OldPolicyRunner with PPO (original) --- gym/envs/a1/a1_config.py | 3 ++- gym/envs/anymal_c/flat/anymal_c_flat_config.py | 4 +++- learning/runners/old_policy_runner.py | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index 190ce41a..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,6 +137,7 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -216,7 +217,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 95d95be9..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,6 +121,7 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -199,6 +201,6 
@@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 6d8edf35..42f019a3 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -2,8 +2,9 @@ import torch from learning.utils import Logger - from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic logger = Logger() @@ -19,6 +20,16 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() @@ -70,7 +81,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") From 9759a4086122740e3dc8cc36983179fc3fc2a990 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 14 Mar 2024 15:10:12 -0400 Subject: [PATCH 24/69] put critic loss_fn into critic module, remove value clipping (very quick testing, clipping value doesn't seem to help) --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo2.py | 15 +++------------ learning/modules/critic.py | 3 +++ learning/runners/on_policy_runner.py | 4 ++++ 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 56d369c0..3c92882d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -140,7 +140,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True + use_clipped_value_loss = True # deprecate for PPO2 clip_param = 0.2 entropy_coef = 0.01 num_learning_epochs = 6 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 98a4904c..971e6241 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,6 +24,7 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + loss_fn="MSE", device="cpu", **kwargs, ): @@ -35,8 +36,8 @@ def __init__( # * PPO components self.actor = actor.to(self.device) - self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) # * PPO parameters @@ -83,17 +84,7 @@ def update_critic(self, data, last_obs=None): generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: value_batch = self.critic.evaluate(batch["critic_obs"]) - if self.use_clipped_value_loss: - target_value_batch = batch["values"] - 
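The OldPolicyRunner fix above re-wraps the split networks so the original PPO keeps its single actor_critic handle. Conceptually the wrapper only has to forward the two parts; a stripped-down stand-in is sketched below (not the repo's ActorCritic, which also exposes export_policy and the actor/critic attributes used elsewhere in the diff).

import torch.nn as nn

class PairedActorCritic(nn.Module):  # hypothetical name, for illustration only
    def __init__(self, actor, critic):
        super().__init__()
        self.actor = actor
        self.critic = critic

    def act(self, obs):
        return self.actor(obs)

    def evaluate(self, critic_obs):
        return self.critic(critic_obs)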
value_clipped = target_value_batch + ( - value_batch - target_value_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - target_value_batch).pow(2) - value_losses_clipped = (value_clipped - batch["returns"]).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (batch["returns"] - value_batch).pow(2).mean() - + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 2733ee40..2f06e13b 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -32,3 +32,6 @@ def evaluate(self, critic_observations): with torch.no_grad(): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() + + def loss_fn(self, input, target): + return nn.functional.mse_loss(input, target, reduction="mean") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 91143d4b..2cd6a9d8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -151,6 +151,7 @@ def save(self): "actor_state_dict": self.alg.actor.state_dict(), "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, @@ -162,6 +163,9 @@ def load(self, path, load_optimizer=True): self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): From 164423b6cbb2c123804b5a0ccf9188f3ee6c0cae Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 15 Mar 2024 16:36:03 -0400 Subject: [PATCH 25/69] removing last_obs from update_critic() --- learning/algorithms/ppo2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 971e6241..5866052c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -74,7 +74,7 @@ def update(self, data, last_obs=None): self.update_critic(data) self.update_actor(data) - def update_critic(self, data, last_obs=None): + def update_critic(self, data): self.mean_value_loss = 0 counter = 0 From 5d6c34caaaf85750b48207e6dfc63e67c20f9f43 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Mon, 18 Mar 2024 14:55:33 -0400 Subject: [PATCH 26/69] Updating lqc files to fit new PPO, runner interfaces --- gym/envs/pendulum/pendulum_config.py | 5 +- gym/utils/helpers.py | 23 +- learning/algorithms/__init__.py | 3 +- learning/algorithms/ppo2_critic_only.py | 169 +++++++++++++++ ..._critic_only.py => ppo_critic_only_old.py} | 0 learning/modules/critic.py | 7 +- learning/runners/__init__.py | 6 +- learning/runners/critic_only_runner.py | 197 +++++++++++++++++- learning/runners/custom_critic_runner.py | 179 ++++++++++++++++ learning/runners/old_critic_only_runner.py | 8 + ...g_runner.py => old_data_logging_runner.py} | 0 11 files changed, 562 insertions(+), 35 deletions(-) create mode 100644 learning/algorithms/ppo2_critic_only.py rename 
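The checkpoint layout written by on_policy_runner above now carries a separate optimizer state per network. The same structure in miniature, with toy modules but the dictionary keys from the diff:

import torch
import torch.nn as nn
import torch.optim as optim

actor, critic = nn.Linear(4, 2), nn.Linear(4, 1)
actor_opt = optim.Adam(actor.parameters(), lr=1e-3)
critic_opt = optim.Adam(critic.parameters(), lr=1e-3)

torch.save(
    {
        "actor_state_dict": actor.state_dict(),
        "critic_state_dict": critic.state_dict(),
        "optimizer_state_dict": actor_opt.state_dict(),
        "critic_optimizer_state_dict": critic_opt.state_dict(),
        "iter": 0,
    },
    "model_0.pt",
)

ckpt = torch.load("model_0.pt")
actor.load_state_dict(ckpt["actor_state_dict"])
critic.load_state_dict(ckpt["critic_state_dict"])
actor_opt.load_state_dict(ckpt["optimizer_state_dict"])
critic_opt.load_state_dict(ckpt["critic_optimizer_state_dict"])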
learning/algorithms/{ppo_critic_only.py => ppo_critic_only_old.py} (100%) create mode 100644 learning/runners/custom_critic_runner.py create mode 100644 learning/runners/old_critic_only_runner.py rename learning/runners/{data_logging_runner.py => old_data_logging_runner.py} (100%) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..01fd573d 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -57,7 +57,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "DataLoggingRunner" + runner_class_name = "CustomCriticRunner" class actor: hidden_dims = [128, 64, 32] @@ -77,6 +77,7 @@ class noise: dof_vel = 0.0 class critic: + critic_class_name = "Critic" obs = [ "dof_pos", "dof_vel", @@ -84,7 +85,6 @@ class critic: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - standard_critic_nn = True class reward: class weights: @@ -114,7 +114,6 @@ class algorithm(FixedRobotCfgPPO.algorithm): # GAE_bootstrap_horizon = .0 # [s] desired_kl = 0.01 max_grad_norm = 1.0 - standard_loss = True plus_c_penalty = 0.1 class runner(FixedRobotCfgPPO.runner): diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 4cba1a02..4dd5c6e4 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -182,29 +182,14 @@ def update_cfg_from_args(env_cfg, train_cfg, args): if args.rl_device is not None: train_cfg.runner.device = args.rl_device if args.train_critic_only: - train_cfg.runner.algorithm_class_name = "PPOCriticOnly" + train_cfg.runner.algorithm_class_name = "PPO2CriticOnly" train_cfg.runner_class_name = "CriticOnlyRunner" train_cfg.runner.resume = True train_cfg.runner.experiment_name += "_critic_only" - if args.task == "pendulum" and args.custom_critic: - train_cfg.policy.standard_critic_nn = False - train_cfg.algorithm.standard_loss = False - train_cfg.runner.run_name += "custom_critic_only" - elif args.task == "pendulum" and not args.custom_critic: - train_cfg.policy.standard_critic_nn = True - train_cfg.algorithm.standard_loss = True - train_cfg.runner.run_name += "standard_critic_only" + train_cfg.runner.run_name += "custom_critic_only" if args.task == "pendulum" and args.custom_critic else "standard_critic_only" else: - if args.task == "pendulum" and args.custom_critic: - train_cfg.policy.standard_critic_nn = False - train_cfg.algorithm.standard_loss = False - train_cfg.runner.experiment_name += "_custom_critic" - train_cfg.runner.run_name += "custom_critic" - elif args.task == "pendulum" and not args.custom_critic: - train_cfg.policy.standard_critic_nn = True - train_cfg.algorithm.standard_loss = True - train_cfg.runner.experiment_name += "_standard_critic" - train_cfg.runner.run_name += "standard_critic" + train_cfg.runner.experiment_name += "_custom_critic" if args.task == "pendulum" and args.custom_critic else "_standard_critic" + train_cfg.runner.run_name += "custom_critic" if args.task == "pendulum" and args.custom_critic else "standard_critic" def get_args(custom_parameters=None): diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index 2523f223..aef59018 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,4 +33,5 @@ from .ppo import PPO from .ppo2 import PPO2 from .SE import StateEstimator -from .ppo_critic_only import PPOCriticOnly +# from .ppo_critic_only_old import PPOCriticOnly +from .ppo2_critic_only import 
PPO2CriticOnly diff --git a/learning/algorithms/ppo2_critic_only.py b/learning/algorithms/ppo2_critic_only.py new file mode 100644 index 00000000..e2b48920 --- /dev/null +++ b/learning/algorithms/ppo2_critic_only.py @@ -0,0 +1,169 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from learning.algorithms.ppo2 import PPO2 + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, +) + + +class PPO2CriticOnly(PPO2): + def __init__( + self, + actor, + critic, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + loss_fn="MSE", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + # self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) + self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + def test_mode(self): + self.actor.test() + self.critic.test() + + def switch_to_train(self): + self.actor.train() + self.critic.train() + + def act(self, obs, critic_obs): + return self.actor.act(obs).detach() + + def update(self, data, last_obs=None): + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic.evaluate(last_obs).detach() + compute_generalized_advantages( + data, self.gamma, self.lam, self.critic, last_values + ) + + self.update_critic(data) + self.update_actor(data) #! gradient step is disabled + + def update_critic(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic.evaluate(batch["critic_obs"]) + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + self.critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) + self.critic_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data): + # already done before + # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) + self.mean_surrogate_loss = 0 + counter = 0 + + self.actor.act(data["actor_obs"]) + data["old_sigma_batch"] = self.actor.action_std.detach() + data["old_mu_batch"] = self.actor.action_mean.detach() + data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( + data["actions"] + ).detach() + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + # ! 
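update() above defers to compute_generalized_advantages before the critic and actor passes. A reference implementation of that computation on plain [T, N] tensors is given below; the repo's version operates on the TensorDict and also handles the timed-out bootstrap, which is omitted here.

import torch

def compute_gae(rewards, values, dones, last_values, gamma=0.998, lam=0.95):
    # rewards, values, dones: [T, N]; last_values: [N] bootstraps after the final step
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    not_done = (~dones).float()
    gae = torch.zeros_like(last_values)
    next_values = last_values
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * next_values * not_done[t] - values[t]
        gae = delta + gamma * lam * not_done[t] * gae
        advantages[t] = gae
        next_values = values[t]
    returns = advantages + values
    return advantages, returns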
refactor how this is done + self.actor.act(batch["actor_obs"]) + actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) + mu_batch = self.actor.action_mean + sigma_batch = self.actor.action_std + entropy_batch = self.actor.entropy + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_batch / batch["old_sigma_batch"] + 1.0e-5) + + ( + torch.square(batch["old_sigma_batch"]) + + torch.square(batch["old_mu_batch"] - mu_batch) + ) + / (2.0 * torch.square(sigma_batch)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + # for param_group in self.optimizer.param_groups: + # # ! check this + # param_group["lr"] = self.learning_rate + + # * Surrogate loss + ratio = torch.exp( + actions_log_prob_batch + - torch.squeeze(batch["old_actions_log_prob_batch"]) + ) + surrogate = -torch.squeeze(batch["advantages"]) * ratio + surrogate_clipped = -torch.squeeze(batch["advantages"]) * torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() + + loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + + # * Gradient step + # self.optimizer.zero_grad() + # loss.backward() + # nn.utils.clip_grad_norm_( + # list(self.actor.parameters()) + list(self.critic.parameters()), + # self.max_grad_norm, + # ) + # self.optimizer.step() + self.mean_surrogate_loss += surrogate_loss.item() + counter += 1 + self.mean_surrogate_loss /= counter diff --git a/learning/algorithms/ppo_critic_only.py b/learning/algorithms/ppo_critic_only_old.py similarity index 100% rename from learning/algorithms/ppo_critic_only.py rename to learning/algorithms/ppo_critic_only_old.py diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 2f06e13b..f91429d5 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -13,16 +13,11 @@ def __init__( hidden_dims=None, activation="elu", normalize_obs=True, - standard_nn=True, **kwargs, ): super().__init__() - self.NN = ( - create_MLP(num_obs, 1, hidden_dims, activation) - if standard_nn - else CholeskyPlusConst(num_obs) - ) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index b6b6a3a6..e41cf82b 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,6 +32,8 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .data_logging_runner import DataLoggingRunner -from .critic_only_runner import CriticOnlyRunner +# from .old_data_logging_runner import DataLoggingRunner +# from .old_critic_only_runner import CriticOnlyRunner from .old_policy_runner import OldPolicyRunner +from .custom_critic_runner import CustomCriticRunner +from .critic_only_runner import CriticOnlyRunner diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index 8a21f661..045b15ca 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -1,8 +1,197 @@ +import os +import numpy as np import torch -from .data_logging_runner import DataLoggingRunner +from tensordict import 
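The schedule="adaptive" branch above adjusts the learning rate from the closed-form KL divergence between the old and new diagonal-Gaussian action distributions; isolated as a function it reads:

import torch

def adapt_learning_rate(lr, mu, sigma, old_mu, old_sigma, desired_kl=0.01):
    kl = torch.sum(
        torch.log(sigma / old_sigma + 1.0e-5)
        + (old_sigma.pow(2) + (old_mu - mu).pow(2)) / (2.0 * sigma.pow(2))
        - 0.5,
        dim=-1,
    ).mean()
    if kl > desired_kl * 2.0:
        return max(1e-5, lr / 1.5)      # shrink when the policy moved too far
    if kl > 0.0 and kl < desired_kl / 2.0:
        return min(1e-2, lr * 1.5)      # grow when the policy barely moved
    return lr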
TensorDict +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic +from learning.utils import Logger -class CriticOnlyRunner(DataLoggingRunner): - def load(self, path): +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + + +logger = Logger() +storage = DictStorage() + + +class CriticOnlyRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + critic_class_name = self.critic_cfg["critic_class_name"] + num_actions = self.get_action_size(self.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + # critic = Critic(num_critic_obs, **self.critic_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + 
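The rollout bookkeeping above keeps one TensorDict per step and appends it to DictStorage. A toy version of that transition, using only the tensordict package imported in the diff; the shapes are illustrative.

import torch
from tensordict import TensorDict

num_envs = 4
transition = TensorDict({}, batch_size=[num_envs])
transition.update(
    {
        "actor_obs": torch.randn(num_envs, 2),
        "critic_obs": torch.randn(num_envs, 2),
        "actions": torch.zeros(num_envs, 1),
        "rewards": torch.zeros(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    }
)
print(transition["rewards"].shape)  # torch.Size([4])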
logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if self.critic_cfg["standard_critic_nn"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py new file mode 100644 index 00000000..f60098f4 --- /dev/null +++ b/learning/runners/custom_critic_runner.py @@ -0,0 +1,179 @@ +import os +import numpy as np +import torch +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic +from learning.utils import Logger + +from .on_policy_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class CustomCriticRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + critic_class_name = self.critic_cfg["critic_class_name"] + num_actions = self.get_action_size(self.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + # critic = Critic(num_critic_obs, **self.critic_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) 
+ critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.all_obs = torch.zeros(self.env.num_envs * (tot_iter - self.it + 1), 2) + self.all_obs[: self.env.num_envs, :] = actor_obs + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + start = self.env.num_envs * self.it + end = self.env.num_envs * (self.it + 1) + self.all_obs[start:end, :] = actor_obs + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.all_obs = self.all_obs.detach().cpu().numpy() + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "standard_training_data.npy" + if self.critic_cfg["standard_critic_nn"] + else "custom_training_data.npy", + ) + + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.save(save_path, self.all_obs) + print(f"Saved training observations to {save_path}") + self.save() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, 
self.alg.critic)) diff --git a/learning/runners/old_critic_only_runner.py b/learning/runners/old_critic_only_runner.py new file mode 100644 index 00000000..3a9de845 --- /dev/null +++ b/learning/runners/old_critic_only_runner.py @@ -0,0 +1,8 @@ +import torch +from .old_data_logging_runner import DataLoggingRunner + + +class CriticOnlyRunner(DataLoggingRunner): + def load(self, path): + loaded_dict = torch.load(path) + self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) diff --git a/learning/runners/data_logging_runner.py b/learning/runners/old_data_logging_runner.py similarity index 100% rename from learning/runners/data_logging_runner.py rename to learning/runners/old_data_logging_runner.py From 0fd15894d3d165764e5eaa3307be088c5a8ccbfd Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:31:02 -0400 Subject: [PATCH 27/69] WIP custom critics with new critic interface --- learning/modules/lqrc/custom_critics.py | 245 ++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 learning/modules/lqrc/custom_critics.py diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py new file mode 100644 index 00000000..a1eccea8 --- /dev/null +++ b/learning/modules/lqrc/custom_critics.py @@ -0,0 +1,245 @@ +from math import floor +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset +from learning.modules.critic import Critic +from learning.modules.utils import get_activation + + +class CustomCriticBaseline(Critic): + + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + nn.Module.__init__() + hidden_dims = [32, 128] if hidden_dims is None else hidden_dims + assert len(hidden_dims) == 2, "Too many hidden dims passed to Custom Critic" + activation = get_activation(activation) + self.input_size = num_obs + self.output_size = output_size + self.device = device + self.intermediates = {} + + self.connection_1 = nn.Linear(self.input_size, hidden_dims[0]) + self.activation_1 = activation + self.connection_2 = nn.Linear(hidden_dims[0], hidden_dims[1]) + self.activation_2 = activation + self.connection_3 = nn.Linear(hidden_dims[1], output_size) + self.activation_3 = nn.Softplus() + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + return output + + +class Cholesky(CustomCriticBaseline): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + super().__init__( + num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)), + device=device, + ) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + y_pred = (x.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x.unsqueeze(2))).squeeze(2) + return y_pred + + def create_cholesky(self, x): + batch_size = x.shape[0] + n 
= self.input_size + L = torch.zeros((batch_size, n, n), device=self.device, requires_grad=False) + tril_indices = torch.tril_indices(row=n, col=n, offset=0) + rows, cols = tril_indices + L[:, rows, cols] = x + return L + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + self.intermediates["A"] = A + + return hook + + +class CholeskyPlusConst(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + # additional 1 to output_size for +c + super(Cholesky, self).__init__( + num_obs=num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)) + 1, + device=device, + ) + self.activation_3.register_forward_hook(self.save_intermediate()) + self.const_penalty = 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[:, :-1]) + A = C.bmm(C.transpose(1, 2)) + c = output[:, -1] + y_pred = (x.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x.unsqueeze(2))).squeeze( + 2 + ) + c.unsqueeze(1) + return y_pred + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output[:, :-1]) + A = C.bmm(C.transpose(1, 2)) + c = output[:, -1] + self.intermediates["A"] = A + self.intermediates["c"] = c + + return hook + + def loss_fn(self, input, target): + loss = F.mse_loss(input, target, reduction="mean") + const_loss = self.const_penalty * torch.mean(self.intermediates["c"]) + return loss + const_loss + + +class CholeskyOffset1(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + # + input_size for offset + super(Cholesky, self).__init__( + num_obs=num_obs, + hidden_dims=hidden_dims, + activation=activation, + normalize_obs=normalize_obs, + output_size=sum(range(num_obs + 1)) + num_obs, + device=device, + ) + self.L_indices = sum(range(num_obs + 1)) + self.activation_3.register_forward_hook(self.save_intermediate()) + + def forward(self, x): + output = self.connection_1(x) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output[:, : self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices :] + x_bar = x - offset + y_pred = ( + x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2)) + ).squeeze(2) + return y_pred + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output[:, : self.L_indices]) + A = C.bmm(C.transpose(1, 2)) + offset = output[:, self.L_indices :] + self.intermediates["A"] = A + self.intermediates["offset"] = offset + + return hook + + +class CholeskyOffset2(Cholesky): + def __init__( + self, + num_obs, + hidden_dims=None, + activation="elu", + normalize_obs=True, + output_size=1, + device="cuda", + **kwargs, + ): + super().__init__(num_obs) + self.xhat_layer = nn.Linear(num_obs, num_obs) + self.xhat_layer.register_forward_hook(self.save_xhat()) + + def forward(self, x): + xhat = 
self.xhat_layer(x) + output = self.connection_1(xhat) + output = self.activation_1(output) + output = self.connection_2(output) + output = self.activation_2(output) + output = self.connection_3(output) + output = self.activation_3(output) + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + y_pred = ( + xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2)) + ).squeeze(2) + return y_pred + + def save_intermediate(self): + def hook(module, input, output): + C = self.create_cholesky(output) + A = C.bmm(C.transpose(1, 2)) + self.intermediates["A"] = A + + return hook + + def save_xhat(self): + def hook(module, input, output): + self.intermediates["xhat"] = output + + return hook From 74a615cbfd1346e1f13b0cce68282ece1afd42d8 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 22 Mar 2024 15:51:56 -0400 Subject: [PATCH 28/69] WIP custom critics refactor --- gym/envs/pendulum/pendulum_config.py | 4 ++-- gym/utils/helpers.py | 4 ++++ learning/modules/critic.py | 1 - learning/modules/lqrc/__init__.py | 1 + learning/modules/lqrc/custom_critics.py | 16 +++++++++++----- learning/runners/critic_only_runner.py | 1 + learning/runners/custom_critic_runner.py | 1 + 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 01fd573d..427891de 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,7 +5,7 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 + num_envs = 2**12 num_actuators = 1 # 1 for theta connecting base and pole episode_length_s = 5.0 @@ -77,7 +77,7 @@ class noise: dof_vel = 0.0 class critic: - critic_class_name = "Critic" + critic_class_name = "Cholesky" obs = [ "dof_pos", "dof_vel", diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 4dd5c6e4..6f540cb0 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -181,6 +181,10 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner.checkpoint = args.checkpoint if args.rl_device is not None: train_cfg.runner.device = args.rl_device + if args.custom_critic: + train_cfg.critic.hidden_dims = None + else: + train_cfg.critic.critic_class_name = "Critic" if args.train_critic_only: train_cfg.runner.algorithm_class_name = "PPO2CriticOnly" train_cfg.runner_class_name = "CriticOnlyRunner" diff --git a/learning/modules/critic.py b/learning/modules/critic.py index f91429d5..744de477 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -3,7 +3,6 @@ from .utils import create_MLP from .utils import RunningMeanStd -from .lqrc import CholeskyPlusConst class Critic(nn.Module): diff --git a/learning/modules/lqrc/__init__.py b/learning/modules/lqrc/__init__.py index f3b1e645..0286940b 100644 --- a/learning/modules/lqrc/__init__.py +++ b/learning/modules/lqrc/__init__.py @@ -1,2 +1,3 @@ from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss from .utils import benchmark_args +from .custom_critics import CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py index a1eccea8..2604f796 100644 --- a/learning/modules/lqrc/custom_critics.py +++ b/learning/modules/lqrc/custom_critics.py @@ -1,10 +1,10 @@ -from math import floor import torch import torch.nn as nn import 
torch.nn.functional as F -from torch.utils.data import Dataset + from learning.modules.critic import Critic -from learning.modules.utils import get_activation +from learning.modules.utils import RunningMeanStd +from learning.modules.utils.neural_net import get_activation class CustomCriticBaseline(Critic): @@ -19,7 +19,7 @@ def __init__( device="cuda", **kwargs, ): - nn.Module.__init__() + super(Critic, self).__init__() hidden_dims = [32, 128] if hidden_dims is None else hidden_dims assert len(hidden_dims) == 2, "Too many hidden dims passed to Custom Critic" activation = get_activation(activation) @@ -28,6 +28,10 @@ def __init__( self.device = device self.intermediates = {} + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_obs) + self.connection_1 = nn.Linear(self.input_size, hidden_dims[0]) self.activation_1 = activation self.connection_2 = nn.Linear(hidden_dims[0], hidden_dims[1]) @@ -35,6 +39,8 @@ def __init__( self.connection_3 = nn.Linear(hidden_dims[1], output_size) self.activation_3 = nn.Softplus() + self.NN = self.forward + def forward(self, x): output = self.connection_1(x) output = self.activation_1(output) @@ -182,7 +188,7 @@ def forward(self, x): output = self.activation_3(output) C = self.create_cholesky(output[:, : self.L_indices]) A = C.bmm(C.transpose(1, 2)) - offset = output[:, self.L_indices :] + offset = output[:, self.L_indices:] x_bar = x - offset y_pred = ( x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2)) diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index 045b15ca..08af8702 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -6,6 +6,7 @@ from gym import LEGGED_GYM_ROOT_DIR from learning.algorithms import * # noqa: F403 from learning.modules import Actor, Critic +from learning.modules.lqrc import Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 from learning.utils import Logger from .on_policy_runner import OnPolicyRunner diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py index f60098f4..82659b86 100644 --- a/learning/runners/custom_critic_runner.py +++ b/learning/runners/custom_critic_runner.py @@ -6,6 +6,7 @@ from gym import LEGGED_GYM_ROOT_DIR from learning.algorithms import * # noqa: F403 from learning.modules import Actor, Critic +from learning.modules.lqrc import Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 from learning.utils import Logger from .on_policy_runner import OnPolicyRunner From de64f6c201831ba888c4fba4825996506ff6ae21 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 10:53:01 -0400 Subject: [PATCH 29/69] WIP: more clean post actor/critic, exporting --- learning/algorithms/ppo2.py | 13 +++------- learning/runners/on_policy_runner.py | 19 +++++++------- setup.py | 5 ++-- .../test_runner_integration.py | 25 ++++++++----------- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 69d6dcd2..bc044d08 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -9,11 +9,10 @@ class PPO2: - # actor_critic: ActorCritic - def __init__( self, - actor_critic, + actor, + critic, num_learning_epochs=1, num_mini_batches=1, clip_param=0.2, @@ -35,12 +34,8 @@ def __init__( self.learning_rate = learning_rate # * PPO components - # todo keeping actor_critic for loading code in runner - self.actor_critic = 
actor_critic.to(self.device) - self.actor = actor_critic.actor.to(self.device) - self.critic = actor_critic.critic.to(self.device) - self.storage = None # initialized later - # parameters = list(self.actor.parameters()) + list(self.critic.parameters()) + self.actor = actor.to(self.device) + self.critic = critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 91e2ac26..b93ff0de 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -144,17 +144,15 @@ def set_up_logger(self): "actor", self.alg.actor_critic, ["action_std", "entropy"] ) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) def save(self): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( { - "actor_state_dict": self.alg.actor_critic.actor.state_dict(), - "critic_state_dict": self.alg.actor_critic.critic.state_dict(), + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "iter": self.it, }, @@ -163,18 +161,19 @@ def save(self): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) - self.alg.actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.actor_critic.critic.load_state_dict(loaded_dict["critic_state_dict"]) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.it = loaded_dict["iter"] def switch_to_eval(self): - self.alg.actor_critic.eval() + self.alg.actor.eval() + self.alg.critic.eval() def get_inference_actions(self): obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) - return self.alg.actor_critic.actor.act_inference(obs) + return self.alg.actor.act_inference(obs) def export(self, path): - self.alg.actor_critic.export_policy(path) + self.alg.actor.export(path) diff --git a/setup.py b/setup.py index c918d911..8a3685da 100644 --- a/setup.py +++ b/setup.py @@ -2,17 +2,16 @@ from distutils.core import setup setup( - name="gpuGym", + name="pkGym", version="1.0.1", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), - description="Isaac Gym environments for Legged Robots", + description="Isaac Gym environments", install_requires=[ "isaacgym", "matplotlib", "pandas", - "tensorboard", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index 4781a665..ef2fc6bb 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -7,7 +7,7 @@ # * imports for onnx test import onnx import onnxruntime -from learning.modules import ActorCritic +from learning.modules import Actor from gym.utils.helpers import get_load_path from gym import LEGGED_GYM_ROOT_DIR @@ -28,20 +28,17 @@ def learn_policy(args): def load_saved_policy(runner): num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_critic_obs = 
runner.get_obs_size(runner.policy_cfg["critic_obs"]) num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **runner.policy_cfg - ).to(runner.device) + actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], checkpoint=runner.cfg["checkpoint"], ) loaded_dict = torch.load(resume_path) - actor_critic.actor.load_state_dict(loaded_dict["actor_state_dict"]) - actor_critic.eval() - return actor_critic + actor.load_state_dict(loaded_dict["actor_state_dict"]) + actor.eval() + return actor class TestDefaultIntegration: @@ -96,19 +93,19 @@ def test_default_integration_settings(self, args): ) obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) - actions_first = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) - actions_loaded = runner.alg.actor_critic.act_inference(obs).cpu().clone() + actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() assert torch.equal(actions_first, actions_loaded), "Model loading failed" # * test onnx exporting - loaded_actor_critic = load_saved_policy(runner) + loaded_actor = load_saved_policy(runner) export_path = os.path.join( LEGGED_GYM_ROOT_DIR, "tests", "integration_tests", "exported_policy" ) - loaded_actor_critic.export_policy(export_path) + loaded_actor.export(export_path) # load the exported onnx path_to_onnx = os.path.join(export_path, "policy.onnx") @@ -122,8 +119,8 @@ def test_default_integration_settings(self, args): # compute torch output with torch.no_grad(): test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] - runner_out = runner.alg.actor_critic.actor.act_inference(test_input) - loaded_out = loaded_actor_critic.actor.act_inference(test_input) + runner_out = runner.alg.actor.act_inference(test_input) + loaded_out = loaded_actor.act_inference(test_input) # compute ONNX Runtime output prediction ort_inputs = {ort_session.get_inputs()[0].name: test_input.cpu().numpy()} From 8dc7b40888d7f32e4154092172ef557608611988 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:08:09 -0400 Subject: [PATCH 30/69] Updating all envs and configs to have split actor and critic --- gym/envs/a1/a1_config.py | 35 ++--- .../anymal_c/flat/anymal_c_flat_config.py | 35 ++--- .../mixed_terrains/anymal_c_rough_config.py | 7 +- gym/envs/base/fixed_robot_config.py | 4 +- gym/envs/base/legged_robot_config.py | 23 ++-- gym/envs/cartpole/cartpole_config.py | 23 +++- gym/envs/mini_cheetah/mini_cheetah_config.py | 38 +++--- .../mini_cheetah/mini_cheetah_osc_config.py | 8 +- .../mini_cheetah/mini_cheetah_ref_config.py | 35 +++-- .../mit_humanoid/humanoid_running_config.py | 34 +++-- gym/envs/mit_humanoid/mit_humanoid_config.py | 26 +++- gym/envs/pendulum/pendulum_config.py | 125 ++++++++++++++++++ learning/algorithms/ppo2.py | 2 +- learning/modules/__init__.py | 1 + learning/modules/actor.py | 5 - learning/modules/critic.py | 7 +- learning/runners/BaseRunner.py | 27 ++-- learning/runners/my_runner.py | 18 +-- learning/runners/old_policy_runner.py | 28 ++-- learning/runners/on_policy_runner.py | 29 ++-- learning/utils/PBRS/test_usecase.py | 8 +- scripts/play.py | 4 +- .../test_runner_integration.py | 16 +-- tests/regression_tests/run_n_iterations.py | 2 +- 24 files changed, 363 insertions(+), 177 deletions(-) create mode 100644 
gym/envs/pendulum/pendulum_config.py diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index cc19d8f3..190ce41a 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -138,12 +138,11 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -154,7 +153,19 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 + projected_gravity = 0.02 + + class critic: + hidden_dims = [256, 256, 256] + # activation can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -165,16 +176,8 @@ class policy(LeggedRobotRunnerCfg.policy): "commands", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 1.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -213,7 +216,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 4ce7f214..95d95be9 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -121,13 +121,12 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + class actor(LeggedRobotRunnerCfg.actor): + hidden_dims = [256, 256, 256] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -138,7 +137,19 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - critic_obs = [ + actions = ["dof_pos_target"] + + class noise: + dof_pos_obs = 0.005 + dof_vel = 0.005 + base_ang_vel = 0.05 # 0.027, 0.14, 0.37 + projected_gravity = 0.02 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [256, 256, 256] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -149,16 +160,8 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_history", ] - actions = ["dof_pos_target"] - - class noise: - dof_pos_obs = 0.005 # can be made very low - dof_vel = 0.005 - base_ang_vel = 0.05 # 0.027, 0.14, 0.37 - projected_gravity = 0.02 - - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class reward: + class weights: tracking_lin_vel = 3.0 tracking_ang_vel = 1.0 lin_vel_z = 0.0 @@ -196,6 +199,6 @@ class 
algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py index badc3faa..85b0c42b 100644 --- a/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py +++ b/gym/envs/anymal_c/mixed_terrains/anymal_c_rough_config.py @@ -56,12 +56,15 @@ class domain_rand(AnymalCFlatCfg.domain_rand): class AnymalCRoughCCfgPPO(AnymalCFlatCfgPPO): - class policy(AnymalCFlatCfgPPO.policy): - actor_hidden_dims = [128, 64, 32] + class actor(AnymalCFlatCfgPPO.actor): + hidden_dims = [128, 64, 32] critic_hidden_dims = [128, 64, 32] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + class critic(AnymalCFlatCfgPPO.critic): + pass + class algorithm(AnymalCFlatCfgPPO.algorithm): entropy_coef = 0.01 diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 4d039f74..8c7427f8 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -125,7 +125,7 @@ class logging: class policy: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -134,7 +134,7 @@ class policy: # rnn_hidden_size = 512 # rnn_num_layers = 1 - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 2955f5a5..7e6cabc6 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -233,24 +233,18 @@ class LeggedRobotRunnerCfg(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] actions = ["q_des"] disable_actions = False @@ -263,6 +257,17 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] + class reward: class weights: tracking_lin_vel = 0.0 diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..f18d5a3b 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -71,11 +71,10 @@ class policy(FixedRobotCfgPPO.policy): init_noise_std = 1.0 num_layers = 2 num_units = 32 - actor_hidden_dims = [num_units] * num_layers - critic_hidden_dims = [num_units] * num_layers + hidden_dims = [num_units] * num_layers activation = "elu" - actor_obs = [ + obs = [ "cart_obs", "pole_trig_obs", "dof_vel", @@ -83,8 +82,6 @@ class policy(FixedRobotCfgPPO.policy): 
"pole_vel_square", ] - critic_obs = actor_obs - actions = ["tau_ff"] class noise: @@ -94,6 +91,20 @@ class noise: pole_vel = 0.010 actuation = 0.00 + class critic: + num_layers = 2 + num_units = 32 + hidden_dims = [num_units] * num_layers + activation = "elu" + + obs = [ + "cart_obs", + "pole_trig_obs", + "dof_vel", + "cart_vel_square", + "pole_vel_square", + ] + class reward: class weights: pole_pos = 5 @@ -125,7 +136,7 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 96 # per iteration max_iterations = 500 # number of policy updates diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 689fcd64..28c70123 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -126,13 +126,12 @@ class MiniCheetahRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(LeggedRobotRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [128, 64] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -141,19 +140,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_target", ] - critic_obs = [ - "base_height", - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "dof_pos_target", - ] - actions = ["dof_pos_target"] add_noise = True + disable_actions = False class noise: scale = 1.0 @@ -165,8 +154,23 @@ class noise: ang_vel = [0.3, 0.15, 0.4] gravity_vec = 0.1 - class reward(LeggedRobotRunnerCfg.policy.reward): - class weights(LeggedRobotRunnerCfg.policy.reward.weights): + class critic: + hidden_dims = [128, 64] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_target", + ] + + class reward: + class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..a4acefa6 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -161,13 +161,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] + class policy: + hidden_dims = [256, 256, 128] critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -244,5 +244,5 @@ class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "FullSend" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7911206a..56d369c0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,15 +67,14 @@ class scaling(MiniCheetahCfg.scaling): class 
MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" - class policy(MiniCheetahRunnerCfg.policy): - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + class actor: + hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True - actor_obs = [ + obs = [ "base_ang_vel", "projected_gravity", "commands", @@ -84,7 +83,25 @@ class policy(MiniCheetahRunnerCfg.policy): "phase_obs", ] - critic_obs = [ + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -96,12 +113,6 @@ class policy(MiniCheetahRunnerCfg.policy): "dof_pos_target", ] - actions = ["dof_pos_target"] - disable_actions = False - - class noise: - pass - class reward: class weights: tracking_lin_vel = 4.0 diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..170e2570 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -165,7 +165,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False - disable_actions = False disable_motors = False # (1: disable, 0: enable...bitwise filter) @@ -201,15 +200,14 @@ class HumanoidRunningRunnerCfg(LeggedRobotRunnerCfg): do_wandb = True seed = -1 - class policy(LeggedRobotRunnerCfg.policy): + class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 256] - critic_hidden_dims = [256, 256, 256] + hidden_dims = [256, 256, 256] # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) activation = "elu" normalize_obs = True # True, False - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -221,10 +219,8 @@ class policy(LeggedRobotRunnerCfg.policy): "in_contact", ] - critic_obs = actor_obs - actions = ["dof_pos_target_legs"] - + disable_actions = False add_noise = True noise_level = 1.0 # scales other values @@ -237,6 +233,24 @@ class noise: dof_vel = 0.01 in_contact = 0.1 + class critic: + hidden_dims = [256, 256, 256] + # (elu, relu, selu, crelu, lrelu, tanh, sigmoid) + activation = "elu" + normalize_obs = True # True, False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "phase_obs", + "dof_pos_legs", + "dof_vel_legs", + "in_contact", + ] + class reward: class weights: # * Behavioral rewards * # @@ -275,8 +289,8 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" - num_steps_per_env = 24 + algorithm_class_name = "PPO2" + num_steps_per_env = 32 max_iterations = 1000 run_name = "ICRA2023" experiment_name = "HumanoidLocomotion" diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..86a8d606 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -181,14 +181,14 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 runner_class_name = "OnPolicyRunner" - class 
policy(LeggedRobotRunnerCfg.policy): + class actor: init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] + hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - actor_obs = [ + obs = [ "base_height", "base_lin_vel", "base_ang_vel", @@ -198,9 +198,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_vel", "dof_pos_history", ] - critic_obs = actor_obs actions = ["dof_pos_target"] + disable_actions = False class noise: base_height = 0.05 @@ -211,6 +211,22 @@ class noise: projected_gravity = 0.05 height_measurements = 0.1 + class critic: + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + class reward: class weights: tracking_ang_vel = 0.5 @@ -248,7 +264,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py new file mode 100644 index 00000000..920356cd --- /dev/null +++ b/gym/envs/pendulum/pendulum_config.py @@ -0,0 +1,125 @@ +import torch + +from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO + + +class PendulumCfg(FixedRobotCfg): + class env(FixedRobotCfg.env): + num_envs = 2**13 + num_actuators = 1 # 1 for theta connecting base and pole + episode_length_s = 5.0 + + class terrain(FixedRobotCfg.terrain): + pass + + class viewer: + ref_env = 0 + pos = [10.0, 5.0, 10.0] # [m] + lookat = [0.0, 0.0, 0.0] # [m] + + class init_state(FixedRobotCfg.init_state): + default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + + # * default setup chooses how the initial conditions are chosen. 
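+        # * two modes are supported (this config uses "reset_to_range"):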
+ # * "reset_to_basic" = a single position + # * "reset_to_range" = uniformly random from a range defined below + reset_mode = "reset_to_range" + + # * initial conditions for reset_to_range + dof_pos_range = { + "theta": [-torch.pi, torch.pi], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(FixedRobotCfg.control): + actuated_joints_mask = [1] # angle + ctrl_frequency = 100 + desired_sim_frequency = 200 + stiffness = {"theta": 0.0} # [N*m/rad] + damping = {"theta": 0.0} # [N*m*s/rad] + + class asset(FixedRobotCfg.asset): + # * Things that differ + file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" + disable_gravity = False + disable_motors = False # all torques set to 0 + joint_damping = 0.1 + + class reward_settings(FixedRobotCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(FixedRobotCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + # * Action scales + tau_ff = 1.0 + + +class PendulumRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "DataLoggingRunner" + + class actor: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + obs = [ + "dof_pos", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + standard_critic_nn = True + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.0 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + # training params + value_loss_coef = 1.0 + use_clipped_value_loss = True + clip_param = 0.2 + entropy_coef = 0.01 + num_learning_epochs = 6 + # * mini batch size = num_envs*nsteps / nminibatches + num_mini_batches = 4 + learning_rate = 1.0e-3 + schedule = "fixed" # could be adaptive, fixed + discount_horizon = 2.0 # [s] + lam = 0.98 + # GAE_bootstrap_horizon = .0 # [s] + desired_kl = 0.01 + max_grad_norm = 1.0 + standard_loss = True + plus_c_penalty = 0.1 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 500 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index bc044d08..98a4904c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -53,7 +53,7 @@ def test_mode(self): self.actor.test() self.critic.test() - def train_mode(self): + def switch_to_train(self): self.actor.train() self.critic.train() diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..3caf5fec 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .critic import Critic diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 7f40d502..fbaa6868 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -17,11 +17,6 @@ def __init__( normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Actor.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self._normalize_obs = normalize_obs diff --git a/learning/modules/critic.py 
b/learning/modules/critic.py index 7ebe76c8..0bc890c7 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -8,16 +8,11 @@ class Critic(nn.Module): def __init__( self, num_obs, - hidden_dims, + hidden_dims=None, activation="elu", normalize_obs=True, **kwargs, ): - if kwargs: - print( - "Critic.__init__ got unexpected arguments, " - "which will be ignored: " + str([key for key in kwargs.keys()]) - ) super().__init__() self.NN = create_MLP(num_obs, 1, hidden_dims, activation) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index d41e7f07..9e5ab2e3 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import ActorCritic +from learning.modules import Actor, Critic from learning.utils import remove_zero_weighted_rewards @@ -10,28 +10,29 @@ def __init__(self, env, train_cfg, device="cpu"): self.env = env self.parse_train_cfg(train_cfg) - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) - actor_critic = ActorCritic( - num_actor_obs, num_critic_obs, num_actions, **self.policy_cfg - ).to(self.device) - - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] self.tot_timesteps = 0 self.it = 0 self.log_dir = train_cfg["log_dir"] + self._set_up_alg() + + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["policy"]["reward"]["weights"]) - self.policy_cfg = train_cfg["policy"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] def init_storage(self): raise NotImplementedError diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index cdd30400..887df8b2 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -24,9 +24,9 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations rewards_dict self.save() @@ -58,9 +58,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -74,9 +74,9 @@ def learn(self): self.env.step() actor_obs = 
self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["actor_obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -116,9 +116,9 @@ def learn(self): self.save() def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 9a738b93..6d8edf35 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -25,8 +25,8 @@ def learn(self): rewards_dict = {} self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -40,17 +40,17 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -84,21 +84,21 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( @@ -134,16 +134,16 @@ def switch_to_eval(self): self.alg.actor_critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor_critic.actor.act_inference(obs) def export(self, path): self.alg.actor_critic.export_policy(path) def init_storage(self): - num_actor_obs = self.get_obs_size(self.policy_cfg["actor_obs"]) - num_critic_obs = self.get_obs_size(self.policy_cfg["critic_obs"]) - num_actions = self.get_action_size(self.policy_cfg["actions"]) + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_critic_obs = 
self.get_obs_size(self.critic_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) self.alg.init_storage( self.env.num_envs, self.num_steps_per_env, diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index b93ff0de..d11fe85d 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -26,9 +26,9 @@ def learn(self): rewards_dict = {} - self.alg.actor_critic.train() - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations rewards_dict self.save() @@ -60,9 +60,9 @@ def learn(self): for i in range(self.num_steps_per_env): actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) transition.update( @@ -76,9 +76,10 @@ def learn(self): self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs timed_out = self.get_timed_out() terminated = self.get_terminated() @@ -120,29 +121,27 @@ def learn(self): def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["termination_weight"], mask=terminated + self.critic_cfg["reward"]["termination_weight"], mask=terminated ) ) rewards_dict.update( self.get_rewards( - self.policy_cfg["reward"]["weights"], + self.critic_cfg["reward"]["weights"], modifier=self.env.dt, mask=~terminated, ) ) def set_up_logger(self): - logger.register_rewards(list(self.policy_cfg["reward"]["weights"].keys())) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( - list(self.policy_cfg["reward"]["termination_weight"].keys()) + list(self.critic_cfg["reward"]["termination_weight"].keys()) ) logger.register_rewards(["total_rewards"]) logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -172,7 +171,7 @@ def switch_to_eval(self): self.alg.critic.eval() def get_inference_actions(self): - obs = self.get_noisy_obs(self.policy_cfg["actor_obs"], self.policy_cfg["noise"]) + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) return self.alg.actor.act_inference(obs) def export(self, path): diff --git a/learning/utils/PBRS/test_usecase.py b/learning/utils/PBRS/test_usecase.py index a84ef547..be5d0412 100644 --- a/learning/utils/PBRS/test_usecase.py +++ b/learning/utils/PBRS/test_usecase.py @@ -30,14 +30,14 @@ def test_PBRS(): logger.initialize(num_envs=1, episode_dt=0.1, total_iterations=100, device="cpu") self_env = MyTask() - # let's create a dummy policy_cfg with just the reward weights - policy_cfg = {"reward": {"weights": {"dummy": 1}}} - policy_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} + # let's create a dummy critic_cfg with just the reward weights + critic_cfg = 
{"reward": {"weights": {"dummy": 1}}} + critic_cfg["reward"]["pbrs_weights"] = {"first": 1, "second": 2} # and a dummy env PBRS = PotentialBasedRewardShaping( - policy_cfg["reward"]["pbrs_weights"], device="cpu" + critic_cfg["reward"]["pbrs_weights"], device="cpu" ) assert PBRS.get_reward_keys() == ["PBRS_first", "PBRS_second"] diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..4486daae 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -46,9 +46,9 @@ def play(env, runner, train_cfg): if env.cfg.viewer.record: recorder.update(i) runner.set_actions( - runner.policy_cfg["actions"], + runner.actor_cfg["actions"], runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], + runner.actor_cfg["disable_actions"], ) env.step() env.check_exit() diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py index ef2fc6bb..f2e607c6 100644 --- a/tests/integration_tests/test_runner_integration.py +++ b/tests/integration_tests/test_runner_integration.py @@ -27,9 +27,9 @@ def learn_policy(args): def load_saved_policy(runner): - num_actor_obs = runner.get_obs_size(runner.policy_cfg["actor_obs"]) - num_actions = runner.get_action_size(runner.policy_cfg["actions"]) - actor = Actor(num_actor_obs, num_actions, **runner.policy_cfg).to(runner.device) + num_actor_obs = runner.get_obs_size(runner.actor_cfg["obs"]) + num_actions = runner.get_action_size(runner.actor_cfg["actions"]) + actor = Actor(num_actor_obs, num_actions, **runner.actor_cfg).to(runner.device) resume_path = get_load_path( name=runner.cfg["experiment_name"], load_run=runner.cfg["load_run"], @@ -57,7 +57,7 @@ def test_default_integration_settings(self, args): with torch.no_grad(): actions = runner.get_inference_actions() - deployed_actions = runner.env.get_states(runner.policy_cfg["actions"]) + deployed_actions = runner.env.get_states(runner.actor_cfg["actions"]) assert ( torch.equal(actions, torch.zeros_like(actions)) is False ), "Policy returning all zeros" @@ -92,7 +92,7 @@ def test_default_integration_settings(self, args): f"{model_5_path}" "(last iteration) was not saved" ) - obs = torch.randn_like(runner.get_obs(runner.policy_cfg["actor_obs"])) + obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"])) actions_first = runner.alg.actor.act_inference(obs).cpu().clone() runner.load(model_8_path) actions_loaded = runner.alg.actor.act_inference(obs).cpu().clone() @@ -118,7 +118,7 @@ def test_default_integration_settings(self, args): ) # compute torch output with torch.no_grad(): - test_input = runner.get_obs(runner.policy_cfg["actor_obs"])[0:1] + test_input = runner.get_obs(runner.actor_cfg["obs"])[0:1] runner_out = runner.alg.actor.act_inference(test_input) loaded_out = loaded_actor.act_inference(test_input) @@ -128,8 +128,8 @@ def test_default_integration_settings(self, args): # compare ONNX Runtime and PyTorch results torch.testing.assert_close( - runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), loaded_out.cpu().numpy(), rtol=1e-05, atol=1e-7 ) torch.testing.assert_close( - runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-8 + runner_out.cpu().numpy(), ort_out[0], rtol=1e-05, atol=1e-7 ) diff --git a/tests/regression_tests/run_n_iterations.py b/tests/regression_tests/run_n_iterations.py index 6a3a4965..29e2f424 100644 --- a/tests/regression_tests/run_n_iterations.py +++ b/tests/regression_tests/run_n_iterations.py @@ -33,7 +33,7 @@ def _run_training_test(output_tensor_file, env, runner): 
runner.learn() # * get the test values after learning - actions = runner.env.get_states(runner.policy_cfg["actions"]) + actions = runner.env.get_states(runner.actor_cfg["actions"]) # * return the values to the parent for assertion actions.detach().cpu() From 53e92bcd82640effcbde755af93ac43d999bdf3b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:13:41 -0400 Subject: [PATCH 31/69] put critic loss_fn into critic module, remove value clipping (very quick testing, clipping value doesn't seem to help) --- gym/envs/a1/a1_config.py | 3 ++- gym/envs/anymal_c/flat/anymal_c_flat_config.py | 4 +++- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo2.py | 15 +++------------ learning/modules/critic.py | 3 +++ learning/runners/old_policy_runner.py | 15 +++++++++++++-- learning/runners/on_policy_runner.py | 4 ++++ 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/gym/envs/a1/a1_config.py b/gym/envs/a1/a1_config.py index 190ce41a..87477b56 100644 --- a/gym/envs/a1/a1_config.py +++ b/gym/envs/a1/a1_config.py @@ -137,6 +137,7 @@ class scaling(LeggedRobotCfg.scaling): class A1RunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -216,7 +217,7 @@ class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "a1" max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" # per iteration # (n_steps in Rudin 2021 paper - batch_size = n_steps * n_robots) num_steps_per_env = 24 diff --git a/gym/envs/anymal_c/flat/anymal_c_flat_config.py b/gym/envs/anymal_c/flat/anymal_c_flat_config.py index 95d95be9..d18f7cca 100644 --- a/gym/envs/anymal_c/flat/anymal_c_flat_config.py +++ b/gym/envs/anymal_c/flat/anymal_c_flat_config.py @@ -85,6 +85,7 @@ class push_robots: toggle = True interval_s = 1 max_push_vel_xy = 0.5 + push_box_dims = [0.2, 0.2, 0.2] class domain_rand(LeggedRobotCfg.domain_rand): randomize_base_mass = True @@ -120,6 +121,7 @@ class scaling(LeggedRobotCfg.scaling): class AnymalCFlatRunnerCfg(LeggedRobotRunnerCfg): seed = -1 + runner_class_name = "OldPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): hidden_dims = [256, 256, 256] @@ -199,6 +201,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "flat_anymal_c" - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" max_iterations = 1000 # number of policy updates num_steps_per_env = 24 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 56d369c0..3c92882d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -140,7 +140,7 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True + use_clipped_value_loss = True # deprecate for PPO2 clip_param = 0.2 entropy_coef = 0.01 num_learning_epochs = 6 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 98a4904c..971e6241 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,6 +24,7 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + loss_fn="MSE", device="cpu", **kwargs, ): @@ -35,8 +36,8 @@ def __init__( # * PPO components self.actor = actor.to(self.device) - self.critic = 
critic.to(self.device) self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic = critic.to(self.device) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) # * PPO parameters @@ -83,17 +84,7 @@ def update_critic(self, data, last_obs=None): generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: value_batch = self.critic.evaluate(batch["critic_obs"]) - if self.use_clipped_value_loss: - target_value_batch = batch["values"] - value_clipped = target_value_batch + ( - value_batch - target_value_batch - ).clamp(-self.clip_param, self.clip_param) - value_losses = (value_batch - target_value_batch).pow(2) - value_losses_clipped = (value_clipped - batch["returns"]).pow(2) - value_loss = torch.max(value_losses, value_losses_clipped).mean() - else: - value_loss = (batch["returns"] - value_batch).pow(2).mean() - + value_loss = self.critic.loss_fn(value_batch, batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 0bc890c7..732f8eda 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -25,3 +25,6 @@ def evaluate(self, critic_observations): with torch.no_grad(): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() + + def loss_fn(self, input, target): + return nn.functional.mse_loss(input, target, reduction="mean") diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 6d8edf35..42f019a3 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -2,8 +2,9 @@ import torch from learning.utils import Logger - from .BaseRunner import BaseRunner +from learning.algorithms import PPO # noqa: F401 +from learning.modules import ActorCritic, Actor, Critic logger = Logger() @@ -19,6 +20,16 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + actor_critic = ActorCritic(actor, critic) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor_critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() @@ -70,7 +81,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index d11fe85d..5085fe91 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -153,6 +153,7 @@ def save(self): "actor_state_dict": self.alg.actor.state_dict(), "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), "iter": self.it, }, path, @@ -164,6 +165,9 @@ def load(self, path, load_optimizer=True): self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + 
self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) self.it = loaded_dict["iter"] def switch_to_eval(self): From 06e62e3ea3dad16b351bd97f1e8c0ea4dd9d9292 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 25 Mar 2024 11:29:11 -0400 Subject: [PATCH 32/69] several missing small fixes (started cherry picking one commit late) --- gym/envs/__init__.py | 5 +- gym/envs/base/fixed_robot.py | 14 ++--- .../mit_humanoid/humanoid_running_config.py | 2 +- gym/envs/pendulum/pendulum.py | 43 ++++++++++++++ .../logging_and_saving/wandb_singleton.py | 2 +- learning/algorithms/ppo2.py | 2 +- learning/modules/actor_critic.py | 35 ++---------- learning/modules/utils/normalize.py | 2 +- learning/runners/my_runner.py | 10 +--- learning/runners/on_policy_runner.py | 2 - resources/robots/pendulum/urdf/pendulum.urdf | 56 +++++++++++++++++++ scripts/train.py | 1 + 12 files changed, 121 insertions(+), 53 deletions(-) create mode 100644 gym/envs/pendulum/pendulum.py create mode 100644 resources/robots/pendulum/urdf/pendulum.urdf diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 80ede800..57cbd8c0 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,6 +19,7 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", + "Pendulum": ".pendulum.pendulum", } config_dict = { @@ -30,6 +31,7 @@ "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", + "PendulumCfg": ".pendulum.pendulum_config", } runner_config_dict = { @@ -41,6 +43,7 @@ "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", + "PendulumRunnerCfg": ".pendulum.pendulum_config", } task_dict = { @@ -62,8 +65,8 @@ "HumanoidRunningCfg", "HumanoidRunningRunnerCfg", ], - "a1": ["A1", "A1Cfg", "A1RunnerCfg"], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 99c038d9..1fe4c5d5 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -592,11 +592,11 @@ def _sqrdexp(self, x, scale=1.0): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def _reward_action_rate(self): """Penalize changes in actions""" @@ -609,7 +609,7 @@ def _reward_action_rate(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" @@ -623,11 +623,11 @@ def _reward_action_rate2(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" - return -torch.sum( + return -torch.mean( 1.0 * ( torch.norm( @@ -649,11 +649,11 @@ def _reward_dof_pos_limits(self): max=0.0 ) # lower limit out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too 
close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 170e2570..8f072f79 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -292,7 +292,7 @@ class runner(LeggedRobotRunnerCfg.runner): algorithm_class_name = "PPO2" num_steps_per_env = 32 max_iterations = 1000 - run_name = "ICRA2023" + run_name = "HumanoidRunning" experiment_name = "HumanoidLocomotion" save_interval = 50 plot_input_gradients = False diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py new file mode 100644 index 00000000..95af1fb2 --- /dev/null +++ b/gym/envs/pendulum/pendulum.py @@ -0,0 +1,43 @@ +import torch + +from gym.envs.base.fixed_robot import FixedRobot + + +class Pendulum(FixedRobot): + def _post_physics_step(self): + """Update all states that are not handled in PhysX""" + super()._post_physics_step() + + def _reward_theta(self): + theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + return self._sqrdexp(theta_rwd.squeeze(dim=-1)) + + def _reward_omega(self): + omega_rwd = torch.square(self.dof_vel[:, 0] / self.scales["dof_vel"]) + return self._sqrdexp(omega_rwd.squeeze(dim=-1)) + + def _reward_equilibrium(self): + error = torch.abs(self.dof_state) + error[:, 0] /= self.scales["dof_pos"] + error[:, 1] /= self.scales["dof_vel"] + return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) + # return torch.exp( + # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma + # ) + + def _reward_torques(self): + """Penalize torques""" + return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + + def _reward_energy(self): + m_pendulum = 1.0 + l_pendulum = 1.0 + kinetic_energy = ( + 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + ) + potential_energy = ( + m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + ) + desired_energy = m_pendulum * 9.81 * l_pendulum + energy_error = kinetic_energy + potential_energy - desired_energy + return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/utils/logging_and_saving/wandb_singleton.py b/gym/utils/logging_and_saving/wandb_singleton.py index a1b45b8a..bba14fea 100644 --- a/gym/utils/logging_and_saving/wandb_singleton.py +++ b/gym/utils/logging_and_saving/wandb_singleton.py @@ -7,7 +7,7 @@ class WandbSingleton(object): def __new__(self): if not hasattr(self, "instance"): - self.instance = super(WandbSingleton, self).__new__(self) + self.instance = super().__new__(self) self.entity_name = None self.project_name = None self.experiment_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 971e6241..5866052c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -74,7 +74,7 @@ def update(self, data, last_obs=None): self.update_critic(data) self.update_actor(data) - def update_critic(self, data, last_obs=None): + def update_critic(self, data): self.mean_value_loss = 0 counter = 0 diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..cec8cbb5 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,44 +1,17 
@@ import torch.nn as nn -from .actor import Actor -from .critic import Critic - class ActorCritic(nn.Module): - def __init__( - self, - num_actor_obs, - num_critic_obs, - num_actions, - actor_hidden_dims=[256, 256, 256], - critic_hidden_dims=[256, 256, 256], - activation="elu", - init_noise_std=1.0, - normalize_obs=True, - **kwargs, - ): + def __init__(self, actor, critic, **kwargs): if kwargs: print( "ActorCritic.__init__ got unexpected arguments, " "which will be ignored: " + str([key for key in kwargs.keys()]) ) - super(ActorCritic, self).__init__() - - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) - - self.critic = Critic( - num_critic_obs, critic_hidden_dims, activation, normalize_obs - ) + super().__init__() - print(f"Actor MLP: {self.actor.NN}") - print(f"Critic MLP: {self.critic.NN}") + self.actor = actor + self.critic = critic @property def action_mean(self): diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index d00b69b5..246bafa4 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -4,7 +4,7 @@ class RunningMeanStd(nn.Module): def __init__(self, num_items, epsilon=1e-05): - super(RunningMeanStd, self).__init__() + super().__init__() self.num_items = num_items self.epsilon = epsilon diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 887df8b2..d97dee8d 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -28,7 +28,6 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -112,7 +111,6 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - storage.clear() self.save() def set_up_logger(self): @@ -124,10 +122,6 @@ def set_up_logger(self): logger.register_category( "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] ) - logger.register_category( - "actor", self.alg.actor_critic, ["action_std", "entropy"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) - logger.attach_torch_obj_to_wandb( - (self.alg.actor_critic.actor, self.alg.actor_critic.critic) - ) + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 5085fe91..2cd6a9d8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -30,7 +30,6 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - rewards_dict self.save() # * start up storage @@ -115,7 +114,6 @@ def learn(self): if self.it % self.save_interval == 0: self.save() - storage.clear() self.save() def update_rewards(self, rewards_dict, terminated): diff --git a/resources/robots/pendulum/urdf/pendulum.urdf b/resources/robots/pendulum/urdf/pendulum.urdf new file mode 100644 index 00000000..90e894e4 --- /dev/null +++ b/resources/robots/pendulum/urdf/pendulum.urdf @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/train.py b/scripts/train.py index dfa6afea..283717e8 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -14,6 +14,7 @@ def setup(): env = 
task_registry.make_env(name=args.task, env_cfg=env_cfg) randomize_episode_counters(env) + randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) From a443470d5b53dfac81a96699b30a4d6b40491b49 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Thu, 4 Apr 2024 19:05:41 -0400 Subject: [PATCH 33/69] custom critics handle varied batch size --- learning/modules/lqrc/custom_critics.py | 69 +++++++++++++----------- learning/runners/critic_only_runner.py | 7 ++- learning/runners/custom_critic_runner.py | 2 +- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py index 2604f796..73e8e4df 100644 --- a/learning/modules/lqrc/custom_critics.py +++ b/learning/modules/lqrc/custom_critics.py @@ -80,23 +80,26 @@ def forward(self, x): output = self.connection_3(output) output = self.activation_3(output) C = self.create_cholesky(output) - A = C.bmm(C.transpose(1, 2)) - y_pred = (x.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x.unsqueeze(2))).squeeze(2) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + y_pred = torch.einsum("...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", + x.unsqueeze(-1).transpose(-2, -1), + A), + x.unsqueeze(-1)).squeeze(-1) return y_pred def create_cholesky(self, x): - batch_size = x.shape[0] n = self.input_size - L = torch.zeros((batch_size, n, n), device=self.device, requires_grad=False) + L = torch.zeros((*x.shape[:-1], n, n), device=self.device, requires_grad=False) tril_indices = torch.tril_indices(row=n, col=n, offset=0) rows, cols = tril_indices - L[:, rows, cols] = x + L[..., rows, cols] = x return L def save_intermediate(self): def hook(module, input, output): C = self.create_cholesky(output) - A = C.bmm(C.transpose(1, 2)) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) self.intermediates["A"] = A return hook @@ -132,19 +135,21 @@ def forward(self, x): output = self.activation_2(output) output = self.connection_3(output) output = self.activation_3(output) - C = self.create_cholesky(output[:, :-1]) - A = C.bmm(C.transpose(1, 2)) - c = output[:, -1] - y_pred = (x.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x.unsqueeze(2))).squeeze( - 2 - ) + c.unsqueeze(1) - return y_pred + C = self.create_cholesky(output[..., :-1]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + c = output[..., -1] + y_pred = torch.einsum("...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", + x.unsqueeze(-1).transpose(-2, -1), + A), + x.unsqueeze(-1)) + c.unsqueeze(-1).unsqueeze(-1) + return y_pred.squeeze(-1) def save_intermediate(self): def hook(module, input, output): - C = self.create_cholesky(output[:, :-1]) - A = C.bmm(C.transpose(1, 2)) - c = output[:, -1] + C = self.create_cholesky(output[..., :-1]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + c = output[..., -1] self.intermediates["A"] = A self.intermediates["c"] = c @@ -186,20 +191,22 @@ def forward(self, x): output = self.activation_2(output) output = self.connection_3(output) output = self.activation_3(output) - C = self.create_cholesky(output[:, : self.L_indices]) - A = C.bmm(C.transpose(1, 2)) - offset = output[:, self.L_indices:] + C = self.create_cholesky(output[..., :self.L_indices]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + offset = output[..., self.L_indices:] x_bar = x - 
offset - y_pred = ( - x_bar.unsqueeze(2).transpose(1, 2).bmm(A).bmm(x_bar.unsqueeze(2)) - ).squeeze(2) + y_pred = torch.einsum("...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", + x_bar.unsqueeze(-1).transpose(-2, -1), + A), + x_bar.unsqueeze(-1)).squeeze(-1) return y_pred def save_intermediate(self): def hook(module, input, output): - C = self.create_cholesky(output[:, : self.L_indices]) - A = C.bmm(C.transpose(1, 2)) - offset = output[:, self.L_indices :] + C = self.create_cholesky(output[..., :self.L_indices]) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + offset = output[..., self.L_indices:] self.intermediates["A"] = A self.intermediates["offset"] = offset @@ -230,16 +237,18 @@ def forward(self, x): output = self.connection_3(output) output = self.activation_3(output) C = self.create_cholesky(output) - A = C.bmm(C.transpose(1, 2)) - y_pred = ( - xhat.unsqueeze(2).transpose(1, 2).bmm(A).bmm(xhat.unsqueeze(2)) - ).squeeze(2) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) + y_pred = torch.einsum("...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", + xhat.unsqueeze(-1).transpose(-2, -1), + A), + xhat.unsqueeze(-1)).squeeze(-1) return y_pred def save_intermediate(self): def hook(module, input, output): C = self.create_cholesky(output) - A = C.bmm(C.transpose(1, 2)) + A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) self.intermediates["A"] = A return hook diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index 08af8702..b37d90e9 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -6,7 +6,12 @@ from gym import LEGGED_GYM_ROOT_DIR from learning.algorithms import * # noqa: F403 from learning.modules import Actor, Critic -from learning.modules.lqrc import Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 +from learning.modules.lqrc import ( + Cholesky, + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2 +) from learning.utils import Logger from .on_policy_runner import OnPolicyRunner diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py index 82659b86..8fad45a1 100644 --- a/learning/runners/custom_critic_runner.py +++ b/learning/runners/custom_critic_runner.py @@ -141,7 +141,7 @@ def learn(self): "logs", "lqrc", "standard_training_data.npy" - if self.critic_cfg["standard_critic_nn"] + if "Cholesky" not in self.critic_cfg["critic_class_name"] else "custom_training_data.npy", ) From 7cb532d8685eb37e66d028b09d368c3d215548ba Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 5 Apr 2024 18:25:28 -0400 Subject: [PATCH 34/69] update evaluate_critic.py and minor fixes --- gym/utils/helpers.py | 4 +- learning/modules/lqrc/custom_critics.py | 2 +- learning/modules/lqrc/evaluate_critic.py | 56 +++++++----------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 6f540cb0..7236d169 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -190,10 +190,10 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner_class_name = "CriticOnlyRunner" train_cfg.runner.resume = True train_cfg.runner.experiment_name += "_critic_only" - train_cfg.runner.run_name += "custom_critic_only" if args.task == "pendulum" and args.custom_critic else "standard_critic_only" + train_cfg.runner.run_name += 
f"{train_cfg.critic.critic_class_name}_custom_critic_only" if args.task == "pendulum" and args.custom_critic else "standard_critic_only" else: train_cfg.runner.experiment_name += "_custom_critic" if args.task == "pendulum" and args.custom_critic else "_standard_critic" - train_cfg.runner.run_name += "custom_critic" if args.task == "pendulum" and args.custom_critic else "standard_critic" + train_cfg.runner.run_name += f"{train_cfg.critic.critic_class_name}_custom_critic" if args.task == "pendulum" and args.custom_critic else "standard_critic" def get_args(custom_parameters=None): diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py index 73e8e4df..d29eed7f 100644 --- a/learning/modules/lqrc/custom_critics.py +++ b/learning/modules/lqrc/custom_critics.py @@ -126,7 +126,7 @@ def __init__( device=device, ) self.activation_3.register_forward_hook(self.save_intermediate()) - self.const_penalty = 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") + self.const_penalty = 0.1 # 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") def forward(self, x): output = self.connection_1(x) diff --git a/learning/modules/lqrc/evaluate_critic.py b/learning/modules/lqrc/evaluate_critic.py index d9eb4023..32ebead4 100644 --- a/learning/modules/lqrc/evaluate_critic.py +++ b/learning/modules/lqrc/evaluate_critic.py @@ -8,7 +8,10 @@ plot_critic_prediction_only, ) from learning.modules import Critic - +from learning.modules.lqrc.custom_critics import ( + CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, + CholeskyOffset2 +) import torch @@ -32,10 +35,14 @@ def filter_state_dict(state_dict): def model_switch(args): - if args["model_type"] == "CholeskyPlusConst": - return Critic(2, standard_nn=False).to(DEVICE) - elif args["model_type"] == "StandardMLP": - return Critic(2, [512, 256, 128], "elu", standard_nn=True).to(DEVICE) + if args["model_type"] == "StandardMLP": + return Critic(2, [128, 64, 32], "tanh").to(DEVICE) + elif args["model_type"] == "CholeskyPlusConst": + return CholeskyPlusConst(2).to(DEVICE) + elif args["model_type"] == "CholeskyOffset1": + return CholeskyOffset1(2).to(DEVICE) + elif args["model_type"] == "CholeskyOffset2": + return CholeskyOffset2(2).to(DEVICE) else: raise KeyError("Specified model type is not supported for critic evaluation.") @@ -46,27 +53,19 @@ def model_switch(args): path = get_load_path(args["experiment_name"], args["load_run"], args["checkpoint"]) model = model_switch(args) - loaded_dict = torch.load(path) - critic_state_dict = filter_state_dict(loaded_dict["model_state_dict"]) + critic_state_dict = torch.load(path)["critic_state_dict"] model.load_state_dict(critic_state_dict) - lb = [0.0, -100.0] - ub = [2.0 * torch.pi, 100.0] + lb = [-torch.pi, -8.0] + ub = [torch.pi, 8.0] x = generate_data(lb, ub) - x_norm = model.normalize(x) + # x_norm = model.normalize(x) y_pred = [] - A_pred = [] - c_pred = [] model.eval() - # t_c = 0 - # p_c = 0 for X_batch in x: y_hat = model.evaluate(X_batch.unsqueeze(0)) y_pred.append(y_hat) - if model_type == "CholeskyPlusConst": - A_pred.append(model.NN.intermediates["A"]) - c_pred.append(model.NN.intermediates["c"]) save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc") if not os.path.exists(save_path): @@ -74,27 +73,6 @@ def model_switch(args): fn = args["fn"] - if model_type == "CholeskyPlusConst": - plot_custom_critic( - x, - torch.vstack(y_pred), - ( - x.unsqueeze(2) - .transpose(1, 2) - .bmm(torch.vstack(A_pred)) - 
.bmm(x.unsqueeze(2)) - ).squeeze(2), - torch.vstack(c_pred), - save_path + f"/{fn}.png", - contour=args["contour"], - ) - plot_critic_prediction_only( - x, - torch.vstack(y_pred), - save_path + f"/{fn}_prediction_only.png", - contour=args["contour"], - ) - else: - plot_critic_prediction_only( + plot_critic_prediction_only( x, torch.vstack(y_pred), save_path + f"/{fn}.png", contour=args["contour"] ) From 92cbbb0eb119b4b3e3ac62e864097e29921315d8 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:58:29 -0400 Subject: [PATCH 35/69] WIP autoencoder --- gym/envs/pendulum/pendulum_config.py | 2 +- learning/modules/lqrc/__init__.py | 1 + learning/modules/lqrc/autoencoder.py | 22 +++++++ learning/modules/lqrc/plotting.py | 59 ++++++------------ learning/modules/lqrc/train_autoencoder.py | 69 ++++++++++++++++++++++ learning/modules/lqrc/utils.py | 52 ++++++++++++++++ 6 files changed, 162 insertions(+), 43 deletions(-) create mode 100644 learning/modules/lqrc/autoencoder.py create mode 100644 learning/modules/lqrc/train_autoencoder.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 427891de..76d3d100 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -77,7 +77,7 @@ class noise: dof_vel = 0.0 class critic: - critic_class_name = "Cholesky" + critic_class_name = "" obs = [ "dof_pos", "dof_vel", diff --git a/learning/modules/lqrc/__init__.py b/learning/modules/lqrc/__init__.py index 0286940b..f7050b41 100644 --- a/learning/modules/lqrc/__init__.py +++ b/learning/modules/lqrc/__init__.py @@ -1,3 +1,4 @@ from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss from .utils import benchmark_args from .custom_critics import CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 +from .autoencoder import Autoencoder \ No newline at end of file diff --git a/learning/modules/lqrc/autoencoder.py b/learning/modules/lqrc/autoencoder.py new file mode 100644 index 00000000..65d5f8cc --- /dev/null +++ b/learning/modules/lqrc/autoencoder.py @@ -0,0 +1,22 @@ +import torch +import torch.nn as nn + +from learning.modules.utils.neural_net import create_MLP + + +class Autoencoder(nn.Module): + def __init__( + self, + num_inputs, + num_latent, + hidden_dims, + activation="elu" + ): + super().__init__() + self.encoder = create_MLP(num_inputs, num_latent, hidden_dims, activation) + self.decoder = create_MLP(num_latent, num_inputs, hidden_dims, activation) + + def forward(self, x): + x = self.encoder(x) + x = self.decoder(x) + return x diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 2f60562c..475bf9ca 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -336,55 +336,30 @@ def plot_predictions_and_gradients( print("Dimension less than 2, cannot graph") -def plot_pointwise_predictions(dim, x_actual, y_pred, y_actual, fn): - if dim > 3: - print("Dimension greater than 3, cannot graph") - elif dim == 3: - x_actual = x_actual.detach().cpu().numpy() - y_pred = y_pred.detach().cpu().numpy() - y_actual = y_actual.detach().cpu().numpy() +def plot_autoencoder(targets, predictions, fn): + targets = targets.detach().cpu().numpy() + predictions = predictions.detach().cpu().numpy() - fig = plt.figure(constrained_layout=True) - ax = fig.add_subplot(projection="3d") - ax.scatter( - x_actual[:, 0], x_actual[:, 1], 
y_pred, marker="o", label="Predicted" - ) - ax.scatter(x_actual[:, 0], x_actual[:, 1], y_actual, marker="^", label="Actual") - ax.set_xlim(-15, 15) - ax.set_ylim(-15, 15) - ax.set_zlim(-15, 250) - ax.set_xlabel("x") - ax.set_ylabel("y") - ax.set_zlabel("f(x, y)") - ax.legend(loc="upper left") - ax.set_title("Pointwise Predictions vs Actual") - plt.legend(loc="upper left") - plt.savefig(f"{fn}.png") - elif dim == 2: - x_actual = x_actual.detach().cpu().numpy() - y_pred = y_pred.detach().cpu().numpy() - y_actual = y_actual.detach().cpu().numpy() - - _, ax = plt.subplots() - ax.scatter(x_actual, y_pred, label="Predicted") - ax.scatter(x_actual, y_actual, label="Actual", alpha=0.5) - ax.set_ylim(-10, 250) - ax.set_xlim(-15, 15) - ax.set_ylabel("y") - ax.set_xlabel("x") - ax.legend(loc="upper left") - ax.set_title("Pointwise Predictions vs Actual") - plt.savefig(f"{fn}.png") - else: - print("Dimension less than 2, cannot graph") + _, ax = plt.subplots() + ax.scatter(targets[:, 0], targets[:, 1], label="Targets", alpha=0.5) + ax.scatter(predictions[:, 0], predictions[:, 1], label="Predictions", alpha=0.5) + ax.set_xlim(min(np.min(targets[:, 0]), np.min(predictions[:, 0])), + max(np.max(targets[:, 0]), np.max(predictions[:, 0]))) + ax.set_ylim(min(np.min(targets[:, 1]), np.min(predictions[:, 1])), + max(np.max(targets[:, 1]), np.max(predictions[:, 1]))) + ax.set_xlabel("theta") + ax.set_ylabel("theta dot") + ax.legend(loc="upper left") + ax.set_title("Pointwise Predictions vs Actual") + plt.savefig(f"{fn}.png") -def plot_loss(loss_arr, fn): +def plot_loss(loss_arr, fn, title="Training Loss Across Epochs"): _, ax = plt.subplots() ax.plot(loss_arr) ax.set_ylim(min(0, min(loss_arr) - 1.0), max(loss_arr) + 1.0) ax.set_xlim(0, len(loss_arr)) ax.set_ylabel("Average Batch MSE") ax.set_xlabel("Epoch") - ax.set_title("Training Loss Across Epochs") + ax.set_title(title) plt.savefig(f"{fn}.png") diff --git a/learning/modules/lqrc/train_autoencoder.py b/learning/modules/lqrc/train_autoencoder.py new file mode 100644 index 00000000..43e1fbdd --- /dev/null +++ b/learning/modules/lqrc/train_autoencoder.py @@ -0,0 +1,69 @@ +import os +import time +import torch +import torch.nn as nn +from autoencoder import Autoencoder +from gym import LEGGED_GYM_ROOT_DIR +from utils import autoencoder_args +from plotting import plot_loss, plot_autoencoder + + +def generate_data(lb, ub, n): + return (ub - lb).expand(n, -1)*torch.rand(n, ub.shape[-1]) + lb + + +if __name__ == "__main__": + args = vars(autoencoder_args()) + num_epochs = args["epochs"] + num_batches = args["batches"] + input_dim = args["input_dim"] + latent_dim = args["latent_dim"] + n = args["n"] + lb = torch.tensor([-torch.pi, -8.0]) + ub = torch.tensor([torch.pi, 8.0]) + + model = Autoencoder(input_dim, latent_dim, [16, 8, 4, 2]) + + loss = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.03) + training_loss = [] + for epoch in range(num_epochs): + batch_loss = [] + for batch in range(num_batches): + target = generate_data(lb, ub, n) + prediction = model(target) + loss = nn.MSELoss(reduction="mean")(target, prediction) + batch_loss.append(loss) + optimizer.zero_grad() + loss.backward() + optimizer.step() + training_loss.append(torch.mean(torch.tensor(batch_loss)).item()) + if epoch % 250 == 0: + print(f"Finished epoch {epoch}, latest loss {training_loss[-1]}") + print(f"Training finished with final loss of {training_loss[-1]}") + + test_loss = [] + targets = generate_data(lb, ub, 100) + predictions = torch.zeros_like(targets) + for ix, target 
in enumerate(targets):
+        prediction = model(target)
+        predictions[ix, :] = prediction
+        loss = nn.MSELoss(reduction="mean")(target, prediction)
+        test_loss.append(loss)
+
+    print(f"Average loss on test set {torch.mean(torch.tensor(test_loss))}")
+
+    save_str = "autoencoder"
+    time_str = time.strftime("%Y%m%d_%H%M%S")
+    save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "autoencoders", save_str)
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+
+    plot_loss(training_loss, f"{save_path}/{save_str}_loss")
+    plot_autoencoder(targets, predictions, f"{save_path}/{save_str}_predictions")
+
+    if args["save_model"]:
+        torch.save(
+            model.state_dict(),
+            save_path + f"/model_{time_str}" + ".pt",
+        )
diff --git a/learning/modules/lqrc/utils.py b/learning/modules/lqrc/utils.py
index 8e7cd9c7..82c879bd 100644
--- a/learning/modules/lqrc/utils.py
+++ b/learning/modules/lqrc/utils.py
@@ -112,6 +112,58 @@ def critic_eval_args():
     return args
 
 
+def autoencoder_args():
+    parser = argparse.ArgumentParser(
+        description="Toggle between autoencoder training settings."
+    )
+    parser.add_argument(
+        "--input_dim",
+        action="store",
+        type=int,
+        nargs="?",
+        default=1,
+        help="Dimension of the input variables",
+    )
+    parser.add_argument(
+        "--latent_dim",
+        action="store",
+        type=int,
+        nargs="?",
+        default=1,
+        help="Dimension of the latent variables",
+    )
+    parser.add_argument(
+        "--epochs",
+        action="store",
+        type=int,
+        nargs="?",
+        default=1000,
+        help="Number of training epochs",
+    )
+    parser.add_argument(
+        "--batches",
+        action="store",
+        type=int,
+        nargs="?",
+        default=4,
+        help="Number of randomly generated batches per epoch.",
+    )
+    parser.add_argument(
+        "--n",
+        action="store",
+        type=int,
+        nargs="?",
+        default=100,
+        help="Number of samples generated per batch.",
+    )
+    parser.add_argument("--save_model", action="store_true", help="Save the model")
+    try:
+        args = parser.parse_args()
+    except SystemExit:
+        print("Exception occurred! 
You've likely mispelled a key.") + return args + + def get_load_path(name, load_run=-1, checkpoint=-1): root = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", name) run_path = select_run(root, load_run) From 02d5bb41e59745fb90e4f9c4b5ad2bcebf85f654 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Tue, 16 Apr 2024 20:04:43 -0400 Subject: [PATCH 36/69] WIP torch.vmap based LQR, new LQR pendulum environment --- gym/envs/__init__.py | 6 +- gym/envs/pendulum/lqr_pendulum.py | 82 ++++++++++++++ gym/envs/pendulum/lqr_pendulum_config.py | 126 +++++++++++++++++++++ gym/envs/pendulum/pendulum.py | 8 +- gym/envs/pendulum/pendulum_config.py | 2 + learning/modules/lqrc/train_autoencoder.py | 12 +- 6 files changed, 226 insertions(+), 10 deletions(-) create mode 100644 gym/envs/pendulum/lqr_pendulum.py create mode 100644 gym/envs/pendulum/lqr_pendulum_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 57cbd8c0..4b5a1546 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -20,6 +20,7 @@ "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", "Pendulum": ".pendulum.pendulum", + "LQRPendulum": ".pendulum.lqr_pendulum", } config_dict = { @@ -32,6 +33,7 @@ "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", + "LQRPendulumCfg": ".pendulum.lqr_pendulum_config", } runner_config_dict = { @@ -44,6 +46,7 @@ "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", + "LQRPendulumRunnerCfg": ".pendulum.lqr_pendulum_config", } task_dict = { @@ -66,7 +69,8 @@ "HumanoidRunningRunnerCfg", ], "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], - "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"] + "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], + "lqr_pendulum": ["LQRPendulum", "LQRPendulumCfg", "LQRPendulumRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py new file mode 100644 index 00000000..e3f9b19f --- /dev/null +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -0,0 +1,82 @@ +import torch +import numpy as np +import torch.linalg as linalg +import scipy + +from torch.func import jacrev +from gym.envs.pendulum.pendulum import Pendulum + + +class LQRPendulum(Pendulum): + + def dynamics(self, theta, theta_dot, u): + m = self.cfg.asset.mass + b = self.cfg.asset.joint_damping + length = self.cfg.asset.length + g = 9.81 + theta_ddot = (1.0/(m*length**2))*(u-b*theta_dot - m*g*length*torch.sin(theta)) + return torch.hstack((theta_dot, theta_ddot)) + + def _compute_torques(self): + """Compute torques from actions. + Actions can be interpreted as position or velocity targets given + to a PD controller, or directly as scaled torques. + [NOTE]: torques must have the same dimension as the number of DOFs, + even if some DOFs are not actuated. 
+ + Args: + actions (torch.Tensor): Actions + + Returns: + [torch.Tensor]: Torques sent to the simulation + """ + actuated_dof_pos = torch.zeros( + self.num_envs, self.num_actuators, device=self.device + ) + actuated_dof_vel = torch.zeros( + self.num_envs, self.num_actuators, device=self.device + ) + for dof_idx in range(self.num_dof): + idx = 0 + if self.cfg.control.actuated_joints_mask[dof_idx]: + actuated_dof_pos[:, idx] = self.dof_pos[:, dof_idx] + actuated_dof_vel[:, idx] = self.dof_vel[:, dof_idx] + idx += 1 + + vectorized_lqr = torch.vmap(self.lqr, in_dims=0, out_dims=0) + Q = torch.eye(self.dof_state.shape[1]).repeat(self.num_envs, 1, 1) + R = torch.eye(self.torques.shape[1]).repeat(self.num_envs, 1, 1) + torques = vectorized_lqr(Q.cpu(), R.cpu(), + torch.zeros(self.dof_state.shape).cpu(), + torch.zeros(self.torques.shape).cpu()) + + torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + return torques.view(self.torques.shape) + + def lqr(self, Q, R, x_d, u_d): + A, B = self.linearize_dynamics(self.dynamics, x_d, u_d) + A = self.detach_numpy(A) + B = self.detach_numpy(B) + Q = self.detach_numpy(Q) + R = self.detach_numpy(R) + x_d = self.detach_numpy(x_d) + u_d = self.detach_numpy(u_d) + S = scipy.linalg.solve_discrete_are(A, B, Q, R) + B_T = B.transpose(-1, -2) + x_bar = (self.dof_state - x_d) + u_prime = u_d - linalg.inv(R + B_T@S@B)@B_T@S@A@x_bar + return u_prime + + def linearize_dynamics(self, dynamics, x_d, u_d): + A = jacrev(dynamics, argnums=0)(*x_d, u_d) + B = jacrev(dynamics, argnums=-1)(*x_d, u_d) + return A, B + + def detach_numpy(self, tensor): + tensor = tensor.detach().cpu() + # if torch._C._functorch.is_gradtrackingtensor(tensor): + # tensor = torch._C._functorch.get_unwrapped(tensor) + # return np.array(tensor.storage().tolist()).reshape(tensor.shape) + tensor = torch._C._functorch.get_unwrapped(tensor) + tensor = np.array(tensor.storage().tolist()).reshape(tensor.shape) + return tensor diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py new file mode 100644 index 00000000..2aa4eeee --- /dev/null +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -0,0 +1,126 @@ +import torch + +from gym.envs.base.fixed_robot_config import FixedRobotCfg, FixedRobotCfgPPO + + +class LQRPendulumCfg(FixedRobotCfg): + class env(FixedRobotCfg.env): + num_envs = 2**12 + num_actuators = 1 # 1 for theta connecting base and pole + episode_length_s = 5.0 + + class terrain(FixedRobotCfg.terrain): + pass + + class viewer: + ref_env = 0 + pos = [10.0, 5.0, 10.0] # [m] + lookat = [0.0, 0.0, 0.0] # [m] + + class init_state(FixedRobotCfg.init_state): + default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + + # * default setup chooses how the initial conditions are chosen. 
+ # * "reset_to_basic" = a single position + # * "reset_to_range" = uniformly random from a range defined below + reset_mode = "reset_to_range" + + # * initial conditions for reset_to_range + dof_pos_range = { + "theta": [-torch.pi/60.0, torch.pi/60.0], + } + dof_vel_range = {"theta": [-5, 5]} + + class control(FixedRobotCfg.control): + actuated_joints_mask = [1] # angle + ctrl_frequency = 100 + desired_sim_frequency = 200 + stiffness = {"theta": 0.0} # [N*m/rad] + damping = {"theta": 0.0} # [N*m*s/rad] + + class asset(FixedRobotCfg.asset): + # * Things that differ + file = "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "pendulum/urdf/pendulum.urdf" + disable_gravity = False + disable_motors = False # all torques set to 0 + joint_damping = 0.1 + mass = 1.0 + length = 1.0 + + class reward_settings(FixedRobotCfg.reward_settings): + tracking_sigma = 0.25 + + class scaling(FixedRobotCfg.scaling): + dof_vel = 5.0 + dof_pos = 2.0 * torch.pi + # * Action scales + tau_ff = 1.0 + + +class LQRPendulumRunnerCfg(FixedRobotCfgPPO): + seed = -1 + runner_class_name = "CustomCriticRunner" + + class actor: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + obs = [ + "dof_pos", + "dof_vel", + ] + + actions = ["tau_ff"] + disable_actions = False + + class noise: + dof_pos = 0.0 + dof_vel = 0.0 + + class critic: + critic_class_name = "" + obs = [ + "dof_pos", + "dof_vel", + ] + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" + + class reward: + class weights: + theta = 0.0 + omega = 0.0 + equilibrium = 1.0 + energy = 0.0 + dof_vel = 0.0 + torques = 0.025 + + class termination_weight: + termination = 0.0 + + class algorithm(FixedRobotCfgPPO.algorithm): + # training params + value_loss_coef = 1.0 + use_clipped_value_loss = True + clip_param = 0.2 + entropy_coef = 0.01 + num_learning_epochs = 6 + # * mini batch size = num_envs*nsteps / nminibatches + num_mini_batches = 4 + learning_rate = 1.0e-3 + schedule = "fixed" # could be adaptive, fixed + discount_horizon = 2.0 # [s] + lam = 0.98 + # GAE_bootstrap_horizon = .0 # [s] + desired_kl = 0.01 + max_grad_norm = 1.0 + plus_c_penalty = 0.1 + + class runner(FixedRobotCfgPPO.runner): + run_name = "" + experiment_name = "pendulum" + max_iterations = 500 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 32 diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 5d3c07f6..18f3d6c9 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -30,14 +30,12 @@ def _reward_torques(self): return self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), sigma=0.2) def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 * self.cfg.mass * self.cfg.length**2 * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + self.cfg.mass * 9.81 * self.cfg.length * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.mass * 9.81 * self.cfg.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 76d3d100..2c974232 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ 
-44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 diff --git a/learning/modules/lqrc/train_autoencoder.py b/learning/modules/lqrc/train_autoencoder.py index 43e1fbdd..0b3b7c5f 100644 --- a/learning/modules/lqrc/train_autoencoder.py +++ b/learning/modules/lqrc/train_autoencoder.py @@ -22,10 +22,14 @@ def generate_data(lb, ub, n): lb = torch.tensor([-torch.pi, -8.0]) ub = torch.tensor([torch.pi, 8.0]) - model = Autoencoder(input_dim, latent_dim, [16, 8, 4, 2]) + # model = Autoencoder(input_dim, latent_dim, [16, 8, 4, 2], "elu") + # model = Autoencoder(input_dim, latent_dim, [512, 256, 128], "elu") + model = Autoencoder(input_dim, latent_dim, [64, 32, 16, 8], "elu") + + # ! TODO: train on reference motion swing up trajectories loss = nn.MSELoss() - optimizer = torch.optim.Adam(model.parameters(), lr=0.03) + optimizer = torch.optim.Adam(model.parameters(), lr=0.0003) training_loss = [] for epoch in range(num_epochs): batch_loss = [] @@ -53,9 +57,9 @@ def generate_data(lb, ub, n): print(f"Average loss on test set {torch.mean(torch.tensor(test_loss))}") - save_str = "autoencoder" + save_str = "autoencoders" time_str = time.strftime("%Y%m%d_%H%M%S") - save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "autoencoders", save_str) + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", save_str) if not os.path.exists(save_path): os.makedirs(save_path) From 7448aa602c9e8365575c071b5fef21e572630a46 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 08:44:12 -0400 Subject: [PATCH 37/69] update setup info --- setup.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 8a3685da..06709c4e 100644 --- a/setup.py +++ b/setup.py @@ -2,19 +2,16 @@ from distutils.core import setup setup( - name="pkGym", - version="1.0.1", + name="QGym", + version="1.0.3", author="Biomimetic Robotics Lab", license="BSD-3-Clause", packages=find_packages(), description="Isaac Gym environments", install_requires=[ "isaacgym", - "matplotlib", - "pandas", "setuptools==59.5.0", "torch>=1.4.0", "torchvision>=0.5.0", - "numpy>=1.16.4", ], ) From a948e2a2d48559207d60cf1cb29570f0578d2bc8 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Thu, 18 Apr 2024 12:55:05 -0400 Subject: [PATCH 38/69] WIP LQR Stabilizer --- gym/envs/pendulum/lqr_pendulum.py | 96 ++++++++++++++++-------- gym/envs/pendulum/lqr_pendulum_config.py | 8 +- scripts/train.py | 3 +- 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index e3f9b19f..f6360bd0 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -1,11 +1,12 @@ import torch import numpy as np -import torch.linalg as linalg +import numpy.linalg as linalg import scipy from torch.func import jacrev from gym.envs.pendulum.pendulum import Pendulum +from learning.utils.logger.TimeKeeper import TimeKeeper class LQRPendulum(Pendulum): @@ -14,7 +15,7 @@ def dynamics(self, theta, theta_dot, u): b = self.cfg.asset.joint_damping length = self.cfg.asset.length g = 9.81 - theta_ddot = (1.0/(m*length**2))*(u-b*theta_dot - m*g*length*torch.sin(theta)) + theta_ddot = (1.0/(m*length**2))*(u-b*theta_dot - m*g*length*torch.sin(theta + (torch.pi/2.0))) return torch.hstack((theta_dot, theta_ddot)) def 
_compute_torques(self): @@ -30,6 +31,8 @@ def _compute_torques(self): Returns: [torch.Tensor]: Torques sent to the simulation """ + timer = TimeKeeper() + timer.tic("Entered compute_torques()") actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -43,40 +46,73 @@ def _compute_torques(self): actuated_dof_vel[:, idx] = self.dof_vel[:, dof_idx] idx += 1 - vectorized_lqr = torch.vmap(self.lqr, in_dims=0, out_dims=0) - Q = torch.eye(self.dof_state.shape[1]).repeat(self.num_envs, 1, 1) - R = torch.eye(self.torques.shape[1]).repeat(self.num_envs, 1, 1) - torques = vectorized_lqr(Q.cpu(), R.cpu(), - torch.zeros(self.dof_state.shape).cpu(), - torch.zeros(self.torques.shape).cpu()) + # vectorized_lqr = torch.vmap(self.lqr, in_dims=0, out_dims=0) + x_d = torch.zeros(self.dof_state.shape, device=self.device) + u_d = torch.zeros(self.torques.shape, device=self.device) + Q = torch.eye(self.dof_state.shape[1], device=self.device) + Q[0, 0] = 10.0 + Q = Q.repeat(self.num_envs, 1, 1) + R = torch.eye(self.torques.shape[1], device=self.device).repeat(self.num_envs, 1, 1) + torques = torch.zeros_like(self.torques) + # torques = vectorized_lqr(Q.cpu(), R.cpu(), + # torch.zeros(self.dof_state.shape).cpu(), + # torch.zeros(self.torques.shape).cpu()) + for env in range(self.num_envs): + u_prime = self.lqr(Q[env], + R[env], + self.dof_state[env], + x_d[env], + u_d[env]) + torques[env] = torch.from_numpy(u_prime) - torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + # torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + if torch.any(torques != torch.zeros_like(torques)): + print("torques", torques) + timer.toc("Entered compute_torques()") + # print(f"Time spent computing torques {timer.get_time('Entered compute_torques()')}") return torques.view(self.torques.shape) - def lqr(self, Q, R, x_d, u_d): - A, B = self.linearize_dynamics(self.dynamics, x_d, u_d) - A = self.detach_numpy(A) - B = self.detach_numpy(B) - Q = self.detach_numpy(Q) - R = self.detach_numpy(R) - x_d = self.detach_numpy(x_d) - u_d = self.detach_numpy(u_d) + def lqr(self, Q, R, x, x_d, u_d): + # A, B = self.linearize_dynamics(self.dynamics, x_d, u_d) + A, B = self.linearize_pendulum_dynamics(x_d) + A = A.cpu().detach().numpy() + B = B.cpu().detach().numpy() + Q = Q.cpu().detach().numpy() + R = R.cpu().detach().numpy() + x = x.cpu().detach().numpy() + x_d = x_d.cpu().detach().numpy() + u_d = u_d.cpu().detach().numpy() S = scipy.linalg.solve_discrete_are(A, B, Q, R) B_T = B.transpose(-1, -2) - x_bar = (self.dof_state - x_d) - u_prime = u_d - linalg.inv(R + B_T@S@B)@B_T@S@A@x_bar + x_bar = (x - x_d) + u_prime = u_d - linalg.inv(R)@B_T@S@x_bar return u_prime - def linearize_dynamics(self, dynamics, x_d, u_d): - A = jacrev(dynamics, argnums=0)(*x_d, u_d) - B = jacrev(dynamics, argnums=-1)(*x_d, u_d) + def linearize_pendulum_dynamics(self, x_d): + m = self.cfg.asset.mass + b = self.cfg.asset.joint_damping + length = self.cfg.asset.length + g = 9.81 + A = torch.tensor([[0.0, 1.0], + [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), -(b/m*length**2)]]) + # A = torch.tensor([[0.0, 1.0], + # [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), 0.0]]) + B = torch.tensor([[0.0], + [(1.0/m*length**2)]]) return A, B - def detach_numpy(self, tensor): - tensor = tensor.detach().cpu() - # if torch._C._functorch.is_gradtrackingtensor(tensor): - # tensor = torch._C._functorch.get_unwrapped(tensor) - # return 
np.array(tensor.storage().tolist()).reshape(tensor.shape) - tensor = torch._C._functorch.get_unwrapped(tensor) - tensor = np.array(tensor.storage().tolist()).reshape(tensor.shape) - return tensor + + # ! address zeroing out + # def linearize_dynamics(self, dynamics, x_d, u_d): + # A = torch.vstack(jacrev(dynamics, argnums=(0, 1))(*x_d, u_d)) + # B = jacrev(dynamics, argnums=(len(x_d) + len(u_d) - 1))(*x_d, u_d) + # return A, B + + # def detach_numpy(self, tensor): + # tensor = tensor.detach().cpu() + # # if torch._C._functorch.is_gradtrackingtensor(tensor): + # # tensor = torch._C._functorch.get_unwrapped(tensor) + # # return np.array(tensor.storage().tolist()).reshape(tensor.shape) + # tensor = torch._C._functorch.get_unwrapped(tensor) + # tensor = np.array(tensor.storage().tolist()).reshape(tensor.shape) + # return tensor diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index 2aa4eeee..fbdf2597 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -7,7 +7,7 @@ class LQRPendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): num_envs = 2**12 num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 5.0 + episode_length_s = 3.0 class terrain(FixedRobotCfg.terrain): pass @@ -18,16 +18,16 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 1.0e-6} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_basic" # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi/60.0, torch.pi/60.0], + "theta": [-torch.pi/20.0, torch.pi/20.0], } dof_vel_range = {"theta": [-5, 5]} diff --git a/scripts/train.py b/scripts/train.py index 283717e8..7e621dee 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,9 +12,8 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) - randomize_episode_counters(env) + # randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) From 8b95b5a04985e432059e9cbd52a4e5ebdb660de1 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 18 Apr 2024 18:06:47 -0400 Subject: [PATCH 39/69] fixed --- gym/envs/pendulum/lqr_pendulum.py | 85 +++++++----------------- gym/envs/pendulum/lqr_pendulum_config.py | 8 +-- scripts/play.py | 2 +- 3 files changed, 28 insertions(+), 67 deletions(-) diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index f6360bd0..fbd18571 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -1,23 +1,11 @@ import torch -import numpy as np import numpy.linalg as linalg import scipy -from torch.func import jacrev from gym.envs.pendulum.pendulum import Pendulum -from learning.utils.logger.TimeKeeper import TimeKeeper class LQRPendulum(Pendulum): - - def dynamics(self, theta, theta_dot, u): - m = self.cfg.asset.mass - b = self.cfg.asset.joint_damping - length = self.cfg.asset.length - g = 9.81 - theta_ddot = (1.0/(m*length**2))*(u-b*theta_dot - m*g*length*torch.sin(theta + 
(torch.pi/2.0))) - return torch.hstack((theta_dot, theta_ddot)) - def _compute_torques(self): """Compute torques from actions. Actions can be interpreted as position or velocity targets given @@ -31,8 +19,8 @@ def _compute_torques(self): Returns: [torch.Tensor]: Torques sent to the simulation """ - timer = TimeKeeper() - timer.tic("Entered compute_torques()") + # timer = TimeKeeper() + # timer.tic("Entered compute_torques()") actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -47,72 +35,45 @@ def _compute_torques(self): idx += 1 # vectorized_lqr = torch.vmap(self.lqr, in_dims=0, out_dims=0) - x_d = torch.zeros(self.dof_state.shape, device=self.device) - u_d = torch.zeros(self.torques.shape, device=self.device) + x_desired = torch.zeros(self.dof_state.shape, device=self.device) + u_desired = torch.zeros(self.torques.shape, device=self.device) Q = torch.eye(self.dof_state.shape[1], device=self.device) Q[0, 0] = 10.0 Q = Q.repeat(self.num_envs, 1, 1) - R = torch.eye(self.torques.shape[1], device=self.device).repeat(self.num_envs, 1, 1) + R = torch.eye(self.torques.shape[1], device=self.device).repeat( + self.num_envs, 1, 1 + ) torques = torch.zeros_like(self.torques) - # torques = vectorized_lqr(Q.cpu(), R.cpu(), - # torch.zeros(self.dof_state.shape).cpu(), - # torch.zeros(self.torques.shape).cpu()) for env in range(self.num_envs): - u_prime = self.lqr(Q[env], - R[env], - self.dof_state[env], - x_d[env], - u_d[env]) + u_prime = self.lqr( + Q[env], R[env], self.dof_state[env], x_desired[env], u_desired[env] + ) torques[env] = torch.from_numpy(u_prime) - - # torques = torch.clip(torques, -self.torque_limits, self.torque_limits) - if torch.any(torques != torch.zeros_like(torques)): - print("torques", torques) - timer.toc("Entered compute_torques()") - # print(f"Time spent computing torques {timer.get_time('Entered compute_torques()')}") return torques.view(self.torques.shape) - def lqr(self, Q, R, x, x_d, u_d): - # A, B = self.linearize_dynamics(self.dynamics, x_d, u_d) - A, B = self.linearize_pendulum_dynamics(x_d) + def lqr(self, Q, R, x, x_desired, u_desired): + A, B = self.linearize_pendulum_dynamics(x_desired) A = A.cpu().detach().numpy() B = B.cpu().detach().numpy() Q = Q.cpu().detach().numpy() R = R.cpu().detach().numpy() x = x.cpu().detach().numpy() - x_d = x_d.cpu().detach().numpy() - u_d = u_d.cpu().detach().numpy() - S = scipy.linalg.solve_discrete_are(A, B, Q, R) + x_desired = x_desired.cpu().detach().numpy() + u_desired = u_desired.cpu().detach().numpy() + # S = scipy.linalg.solve_discrete_are(A, B, Q, R) + S = scipy.linalg.solve_continuous_are(A, B, Q, R) B_T = B.transpose(-1, -2) - x_bar = (x - x_d) - u_prime = u_d - linalg.inv(R)@B_T@S@x_bar + x_bar = x - x_desired + u_prime = u_desired - linalg.inv(R) @ B_T @ S @ x_bar return u_prime - def linearize_pendulum_dynamics(self, x_d): + def linearize_pendulum_dynamics(self, x_desired): m = self.cfg.asset.mass b = self.cfg.asset.joint_damping length = self.cfg.asset.length g = 9.81 - A = torch.tensor([[0.0, 1.0], - [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), -(b/m*length**2)]]) - # A = torch.tensor([[0.0, 1.0], - # [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), 0.0]]) - B = torch.tensor([[0.0], - [(1.0/m*length**2)]]) - return A, B + ml2 = m * length**2 - - # ! 
address zeroing out - # def linearize_dynamics(self, dynamics, x_d, u_d): - # A = torch.vstack(jacrev(dynamics, argnums=(0, 1))(*x_d, u_d)) - # B = jacrev(dynamics, argnums=(len(x_d) + len(u_d) - 1))(*x_d, u_d) - # return A, B - - # def detach_numpy(self, tensor): - # tensor = tensor.detach().cpu() - # # if torch._C._functorch.is_gradtrackingtensor(tensor): - # # tensor = torch._C._functorch.get_unwrapped(tensor) - # # return np.array(tensor.storage().tolist()).reshape(tensor.shape) - # tensor = torch._C._functorch.get_unwrapped(tensor) - # tensor = np.array(tensor.storage().tolist()).reshape(tensor.shape) - # return tensor + A = torch.tensor([[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]]) + B = torch.tensor([[0.0], [(1.0 / ml2)]]) + return A, B diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index fbdf2597..91a2bd8d 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -27,14 +27,14 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi/20.0, torch.pi/20.0], + "theta": [-torch.pi / 2.0, torch.pi / 2.0], } - dof_vel_range = {"theta": [-5, 5]} + dof_vel_range = {"theta": [0, 0]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 500 + desired_sim_frequency = 500 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] diff --git a/scripts/play.py b/scripts/play.py index 4486daae..1a23c22e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -16,9 +16,9 @@ def setup(args): env_cfg.commands.resampling_time = 9999 env_cfg.env.episode_length_s = 9999 env_cfg.env.num_projectiles = 20 + env_cfg.init_state.reset_mode = "reset_to_range" task_registry.make_gym_and_sim() env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) From b951fd7e5f0050471259f86283a5aa9022a76a6f Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:48:17 -0400 Subject: [PATCH 40/69] small fixes --- gym/envs/pendulum/lqr_pendulum.py | 16 ++++++++++------ gym/envs/pendulum/lqr_pendulum_config.py | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index f6360bd0..a548c7b3 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -15,7 +15,9 @@ def dynamics(self, theta, theta_dot, u): b = self.cfg.asset.joint_damping length = self.cfg.asset.length g = 9.81 - theta_ddot = (1.0/(m*length**2))*(u-b*theta_dot - m*g*length*torch.sin(theta + (torch.pi/2.0))) + ml2 = m*length**2 + theta_ddot = (1.0/ml2)*(u-b*theta_dot - m*g*length*torch.sin(theta + (torch.pi/2.0))) + # theta_ddot = (1.0/ml2)*(u-b*theta_dot - m*g*length*torch.sin(theta)) return torch.hstack((theta_dot, theta_ddot)) def _compute_torques(self): @@ -65,7 +67,7 @@ def _compute_torques(self): u_d[env]) torques[env] = torch.from_numpy(u_prime) - # torques = torch.clip(torques, -self.torque_limits, self.torque_limits) + torques = torch.clip(torques, -self.torque_limits, self.torque_limits) if torch.any(torques != torch.zeros_like(torques)): print("torques", torques) timer.toc("Entered compute_torques()") @@ -82,7 +84,8 
@@ def lqr(self, Q, R, x, x_d, u_d): x = x.cpu().detach().numpy() x_d = x_d.cpu().detach().numpy() u_d = u_d.cpu().detach().numpy() - S = scipy.linalg.solve_discrete_are(A, B, Q, R) + S = scipy.linalg.solve_continuous_are(A, B, Q, R) + # S = scipy.linalg.solve_discrete_are(A, B, Q, R) B_T = B.transpose(-1, -2) x_bar = (x - x_d) u_prime = u_d - linalg.inv(R)@B_T@S@x_bar @@ -93,12 +96,13 @@ def linearize_pendulum_dynamics(self, x_d): b = self.cfg.asset.joint_damping length = self.cfg.asset.length g = 9.81 + ml2 = m*length**2 A = torch.tensor([[0.0, 1.0], - [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), -(b/m*length**2)]]) + [(1.0/ml2)*(-m*g*length*torch.cos(x_d[0] + (torch.pi/2.0))), -(b/ml2)]]) # A = torch.tensor([[0.0, 1.0], - # [(1.0/m*length**2)*(-m*g*length*torch.cos(x_d[0] + torch.pi/2.0)), 0.0]]) + # [(1.0/ml2)*(-m*g*length*torch.cos(x_d[0])), -(b/ml2)]]) B = torch.tensor([[0.0], - [(1.0/m*length**2)]]) + [(1.0/ml2)]]) return A, B diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index fbdf2597..cd0b1f20 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 1.0e-6} # -torch.pi / 2.0} + default_joint_angles = {"theta": 1e-3} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. # * "reset_to_basic" = a single position @@ -27,7 +27,7 @@ class init_state(FixedRobotCfg.init_state): # * initial conditions for reset_to_range dof_pos_range = { - "theta": [-torch.pi/20.0, torch.pi/20.0], + "theta": [-1e-3, 1e-3], } dof_vel_range = {"theta": [-5, 5]} From e10c8360a9c17c22b1e88fe5a802ef7cce1eba7a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:15:57 -0400 Subject: [PATCH 41/69] burn in normalization before learning --- learning/algorithms/ppo2.py | 2 +- learning/runners/on_policy_runner.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..db56ef8c 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -58,7 +58,7 @@ def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data, last_obs=None): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..60dff1e9 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,7 +37,7 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -50,6 +50,10 @@ def learn(self): device=self.device, ) + # burn in observation normalization. 
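# A minimal sketch of what the burn-in above is warming up: a running
# mean/variance ("obs_rms"-style) normalizer that only becomes informative
# after it has seen enough observations. The RunningNorm class below is an
# assumption for illustration -- the repository's own obs_rms implementation
# is not shown in this patch -- but the burn-in idea is the same: push rollout
# observations through the normalizer for a while before any gradient step.
import torch


class RunningNorm:
    def __init__(self, shape, eps=1e-8):
        self.mean = torch.zeros(shape)
        self.var = torch.ones(shape)
        self.count = eps
        self.eps = eps

    def update(self, batch):
        # parallel (Chan et al.) update of mean/variance from a batch of rows
        batch_mean = batch.mean(dim=0)
        batch_var = batch.var(dim=0, unbiased=False)
        batch_count = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta**2 * self.count * batch_count / total) / total
        self.count = total

    def __call__(self, obs):
        return (obs - self.mean) / torch.sqrt(self.var + self.eps)


if __name__ == "__main__":
    norm = RunningNorm(shape=(2,))
    for _ in range(100):  # the "burn-in" loop: update stats, discard the rollout
        norm.update(torch.randn(64, 2) * 5.0 + 3.0)
    print(norm.mean, norm.var)  # approaches mean [3, 3] and variance [25, 25]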
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -57,7 +61,7 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -116,6 +120,21 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( From 6eaf961faa364707619f13089971acb83c013108 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:28:21 -0400 Subject: [PATCH 42/69] LQR swing-up with single linearization point --- gym/envs/pendulum/lqr_pendulum.py | 60 ++++++++++++------------ gym/envs/pendulum/lqr_pendulum_config.py | 4 +- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index fbd18571..1ffe8d7e 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -1,11 +1,20 @@ import torch -import numpy.linalg as linalg import scipy from gym.envs.pendulum.pendulum import Pendulum class LQRPendulum(Pendulum): + def _init_buffers(self): + super()._init_buffers() + self.x_desired = torch.tensor([0., 0.], device=self.device) + self.u_desired = torch.zeros(self.torques.shape, device=self.device) + self.A, self.B = self.linearize_pendulum_dynamics(self.x_desired) + self.Q = torch.eye(self.dof_state.shape[1], device=self.device) + self.Q[0, 0] = 10.0 + self.R = torch.eye(self.torques.shape[1], device=self.device) + self.S = torch.from_numpy(self.solve_ricatti(self.Q, self.R)).float().to(self.device) + def _compute_torques(self): """Compute torques from actions. 
Actions can be interpreted as position or velocity targets given @@ -19,8 +28,6 @@ def _compute_torques(self): Returns: [torch.Tensor]: Torques sent to the simulation """ - # timer = TimeKeeper() - # timer.tic("Entered compute_torques()") actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -34,37 +41,28 @@ def _compute_torques(self): actuated_dof_vel[:, idx] = self.dof_vel[:, dof_idx] idx += 1 - # vectorized_lqr = torch.vmap(self.lqr, in_dims=0, out_dims=0) - x_desired = torch.zeros(self.dof_state.shape, device=self.device) - u_desired = torch.zeros(self.torques.shape, device=self.device) - Q = torch.eye(self.dof_state.shape[1], device=self.device) - Q[0, 0] = 10.0 - Q = Q.repeat(self.num_envs, 1, 1) - R = torch.eye(self.torques.shape[1], device=self.device).repeat( - self.num_envs, 1, 1 - ) - torques = torch.zeros_like(self.torques) - for env in range(self.num_envs): - u_prime = self.lqr( - Q[env], R[env], self.dof_state[env], x_desired[env], u_desired[env] - ) - torques[env] = torch.from_numpy(u_prime) + torques = self.lqr_u_prime(self.S, self.R, self.dof_state, self.x_desired, self.u_desired) return torques.view(self.torques.shape) - def lqr(self, Q, R, x, x_desired, u_desired): - A, B = self.linearize_pendulum_dynamics(x_desired) - A = A.cpu().detach().numpy() - B = B.cpu().detach().numpy() + def solve_ricatti(self, Q, R): + A = self.A.cpu().detach().numpy() + B = self.B.cpu().detach().numpy() Q = Q.cpu().detach().numpy() R = R.cpu().detach().numpy() - x = x.cpu().detach().numpy() - x_desired = x_desired.cpu().detach().numpy() - u_desired = u_desired.cpu().detach().numpy() - # S = scipy.linalg.solve_discrete_are(A, B, Q, R) S = scipy.linalg.solve_continuous_are(A, B, Q, R) - B_T = B.transpose(-1, -2) - x_bar = x - x_desired - u_prime = u_desired - linalg.inv(R) @ B_T @ S @ x_bar + return S + + def lqr_u_prime(self, S, R, x, x_desired, u_desired): + batch_envs = x.shape[0] + B_T = self.B.transpose(-1, -2).expand(batch_envs, -1, -1) + S = S.expand(batch_envs, -1, -1) + R_inv = torch.linalg.inv(R.expand(batch_envs, -1, -1)) + x_bar = x - x_desired.expand(batch_envs, -1) + # u_prime = u_desired - torch.linalg.inv(R) @ B_T @ S @ x_bar + K = torch.einsum("...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T), + S) + u_prime = u_desired - torch.einsum("...ij, ...jk -> ...ik", K, x_bar.unsqueeze(-1)).squeeze(-1) return u_prime def linearize_pendulum_dynamics(self, x_desired): @@ -74,6 +72,6 @@ def linearize_pendulum_dynamics(self, x_desired): g = 9.81 ml2 = m * length**2 - A = torch.tensor([[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]]) - B = torch.tensor([[0.0], [(1.0 / ml2)]]) + A = torch.tensor([[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]], device=self.device) + B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device) return A, B diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index 174fa2be..2cbd6c43 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -18,7 +18,7 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 1e-3} # -torch.pi / 2.0} + default_joint_angles = {"theta": torch.pi} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. 
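# Sketch of the single-linearization-point LQR set up in _init_buffers above,
# in plain NumPy/SciPy. m, b and length are illustrative values (the real ones
# come from cfg.asset); A and B mirror linearize_pendulum_dynamics at
# x_desired = [0, 0] (cos(0) = 1, signs as in the patch). With a
# continuous-time (A, B) linearization, solve_continuous_are is the matching
# Riccati solver; the earlier solve_discrete_are call assumed a discrete-time
# model.
import numpy as np
import scipy.linalg

m, b, length, g = 1.0, 0.1, 1.0, 9.81  # assumed example parameters
ml2 = m * length**2                    # note: 1.0/m*length**2 parses as
                                       # (1.0/m)*length**2, hence this helper
A = np.array([[0.0, 1.0],
              [g / length, -b / ml2]])
B = np.array([[0.0],
              [1.0 / ml2]])
Q = np.diag([10.0, 1.0])               # same weighting as the patch (Q[0, 0] = 10)
R = np.eye(1)

S = scipy.linalg.solve_continuous_are(A, B, Q, R)
K = np.linalg.inv(R) @ B.T @ S         # u = u_desired - K @ (x - x_desired)

# The closed-loop matrix A - B K should be Hurwitz (all eigenvalues in the
# left half plane) when the gain is correct:
print("K =", K)
print("closed-loop eigenvalues:", np.linalg.eigvals(A - B @ K))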
# * "reset_to_basic" = a single position @@ -29,7 +29,7 @@ class init_state(FixedRobotCfg.init_state): dof_pos_range = { "theta": [-torch.pi / 2.0, torch.pi / 2.0], } - dof_vel_range = {"theta": [0, 0]} + dof_vel_range = {"theta": [-5, 5]} class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle From acedd9719defd776d0ff5728926cf31c737c9b0f Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 26 Apr 2024 16:23:41 -0400 Subject: [PATCH 43/69] fix configs wrt to normalization, and fixed_base --- gym/envs/base/fixed_robot_config.py | 27 +++++++++---------- gym/envs/base/legged_robot_config.py | 3 +-- gym/envs/cartpole/cartpole_config.py | 2 +- gym/envs/mini_cheetah/mini_cheetah_config.py | 3 ++- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +-- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 ++ gym/envs/pendulum/pendulum_config.py | 9 ++++--- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..b5b5e88d 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + "observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..2dd2083d 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -238,13 +238,12 @@ class actor: hidden_dims = [512, 256, 128] # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] + normalize_obs = True actions = ["q_des"] disable_actions = False diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index f18d5a3b..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..2f9bf540 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,7 +130,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = 
"elu" - obs = [ "base_lin_vel", "base_ang_vel", @@ -140,6 +139,7 @@ class actor: "dof_vel", "dof_pos_target", ] + normalize_obs = True actions = ["dof_pos_target"] add_noise = True disable_actions = False @@ -168,6 +168,7 @@ class critic: "dof_vel", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..d89715fc 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,6 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_ang_vel", "projected_gravity", @@ -82,6 +81,7 @@ class actor: "dof_vel", "phase_obs", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -100,7 +100,6 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True obs = [ "base_height", "base_lin_vel", @@ -112,6 +111,7 @@ class critic: "phase_obs", "dof_pos_target", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 86a8d606..c7e061a9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -198,6 +198,7 @@ class actor: "dof_vel", "dof_pos_history", ] + normalize_obs = True actions = ["dof_pos_target"] disable_actions = False @@ -226,6 +227,7 @@ class critic: "dof_vel", "dof_pos_history", ] + normalize_obs = True class reward: class weights: diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..eed5d0a9 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -68,6 +68,7 @@ class actor: "dof_pos", "dof_vel", ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False @@ -77,14 +78,14 @@ class noise: dof_vel = 0.0 class critic: + hidden_dims = [128, 64, 32] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "tanh" obs = [ "dof_pos", "dof_vel", ] - hidden_dims = [128, 64, 32] - # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" - standard_critic_nn = True + normalize_obs = True class reward: class weights: From d26f02741a570dffb2ea57929883b1c00661ff65 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 26 Apr 2024 19:34:23 -0400 Subject: [PATCH 44/69] WIP LQR data generating runs --- gym/envs/pendulum/lqr_pendulum.py | 1 - gym/envs/pendulum/lqr_pendulum_config.py | 2 +- learning/runners/__init__.py | 3 +- learning/runners/custom_critic_runner.py | 1 - learning/runners/lqr_data_gen_runner.py | 134 +++++++++++++++++++++++ 5 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 learning/runners/lqr_data_gen_runner.py diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index 1ffe8d7e..f3e3f7c8 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -58,7 +58,6 @@ def lqr_u_prime(self, S, R, x, x_desired, u_desired): S = S.expand(batch_envs, -1, -1) R_inv = torch.linalg.inv(R.expand(batch_envs, -1, -1)) x_bar = x - x_desired.expand(batch_envs, -1) - # u_prime = u_desired - torch.linalg.inv(R) @ B_T @ S @ x_bar K = torch.einsum("...ij, ...jk -> ...ik", 
torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T), S) diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index 2cbd6c43..e8d27a95 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -59,7 +59,7 @@ class scaling(FixedRobotCfg.scaling): class LQRPendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "CustomCriticRunner" + runner_class_name = "OnPolicyRunner" #"LQRDataGenRunner" class actor: hidden_dims = [128, 64, 32] diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index e41cf82b..f957b03d 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,8 +32,7 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -# from .old_data_logging_runner import DataLoggingRunner -# from .old_critic_only_runner import CriticOnlyRunner from .old_policy_runner import OldPolicyRunner from .custom_critic_runner import CustomCriticRunner from .critic_only_runner import CriticOnlyRunner +from .lqr_data_gen_runner import LQRDataGenRunner diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py index 8fad45a1..10f98f24 100644 --- a/learning/runners/custom_critic_runner.py +++ b/learning/runners/custom_critic_runner.py @@ -32,7 +32,6 @@ def _set_up_alg(self): critic_class_name = self.critic_cfg["critic_class_name"] num_actions = self.get_action_size(self.actor_cfg["actions"]) actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - # critic = Critic(num_critic_obs, **self.critic_cfg) critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/lqr_data_gen_runner.py b/learning/runners/lqr_data_gen_runner.py new file mode 100644 index 00000000..751aba5c --- /dev/null +++ b/learning/runners/lqr_data_gen_runner.py @@ -0,0 +1,134 @@ +import os +import time +import torch +import numpy as np +from tensordict import TensorDict + +from gym import LEGGED_GYM_ROOT_DIR +from learning.utils import Logger +from .custom_critic_runner import OnPolicyRunner +from learning.storage import DictStorage + +logger = Logger() +storage = DictStorage() + + +class LQRDataGenRunner(OnPolicyRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + self.total_steps = self.num_learning_iterations*self.num_steps_per_env + self.data_dict = {"observations": np.zeros((self.total_steps, env.num_envs, env.dof_state.shape[0])), + "actions": np.zeros((self.total_steps, env.num_envs, env.num_actuators)), + "total_rewards": np.zeros((self.total_steps, env.num_envs, 1))} + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "actions": self.alg.act(actor_obs, critic_obs), + "critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + 
self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + self.data_dict["observations"][self.it*i, ...] = actor_obs + self.data_dict["actions"][self.it*i, ...] = self.torques + self.data_dict["total_rewards"][self.it*i, ...] = total_rewards + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + time_str = time.strftime("%Y%m%d_%H%M%S") + save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc", "lqr_data", f"lqr_{time_str}.npz") + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + np.savez(save_path, *self.data_dict) + print(f"Saved data from LQR Pendulum run to {save_path}") From 01de0fe3ba5415228101d3cdec6ba4c2ad5a4990 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 29 Apr 2024 15:40:51 -0400 Subject: [PATCH 45/69] - save data each iteration in lqr_data_gen_runner - ruff the code. 
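# Sketch of the per-iteration export this commit introduces: a flat dict of
# tensors is written once per iteration, both as a .pt file and as a
# compressed NumPy archive, and can be reloaded offline without the gym stack.
# The field names below ("critic_obs", "actions", "rewards") are illustrative;
# the real keys are whatever the runner's storage holds.
import numpy as np
import torch

iteration = 1
data = {
    "critic_obs": torch.randn(50, 32, 2),  # (steps, envs, obs_dim)
    "actions": torch.randn(50, 32, 1),
    "rewards": torch.randn(50, 32),
}

# save (mirrors torch.save(...) plus export_to_numpy(...) in the runner)
torch.save({k: v.cpu() for k, v in data.items()}, f"iteration_{iteration}.pt")
np.savez_compressed(
    f"iteration_{iteration}.npz",
    **{k: v.cpu().numpy() for k, v in data.items()},
)

# load the NumPy archive later, e.g. for offline critic fitting
loaded = np.load(f"iteration_{iteration}.npz")
print(loaded.files, loaded["rewards"].shape)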
--- gym/envs/pendulum/lqr_pendulum.py | 27 +++++++---- gym/envs/pendulum/lqr_pendulum_config.py | 4 +- gym/utils/helpers.py | 18 +++++-- learning/algorithms/ppo2_critic_only.py | 6 +-- learning/modules/lqrc/autoencoder.py | 9 +--- learning/modules/lqrc/custom_critics.py | 56 ++++++++++++---------- learning/modules/lqrc/evaluate_critic.py | 10 ++-- learning/modules/lqrc/plotting.py | 12 +++-- learning/modules/lqrc/train_autoencoder.py | 12 ++--- learning/runners/BaseRunner.py | 3 ++ learning/runners/critic_only_runner.py | 10 +--- learning/runners/custom_critic_runner.py | 5 +- learning/runners/lqr_data_gen_runner.py | 43 ++++++++++++----- learning/utils/dict_utils.py | 10 ++++ scripts/train.py | 2 +- 15 files changed, 139 insertions(+), 88 deletions(-) diff --git a/gym/envs/pendulum/lqr_pendulum.py b/gym/envs/pendulum/lqr_pendulum.py index f3e3f7c8..49d99696 100644 --- a/gym/envs/pendulum/lqr_pendulum.py +++ b/gym/envs/pendulum/lqr_pendulum.py @@ -7,13 +7,15 @@ class LQRPendulum(Pendulum): def _init_buffers(self): super()._init_buffers() - self.x_desired = torch.tensor([0., 0.], device=self.device) + self.x_desired = torch.tensor([0.0, 0.0], device=self.device) self.u_desired = torch.zeros(self.torques.shape, device=self.device) self.A, self.B = self.linearize_pendulum_dynamics(self.x_desired) self.Q = torch.eye(self.dof_state.shape[1], device=self.device) self.Q[0, 0] = 10.0 self.R = torch.eye(self.torques.shape[1], device=self.device) - self.S = torch.from_numpy(self.solve_ricatti(self.Q, self.R)).float().to(self.device) + self.S = ( + torch.from_numpy(self.solve_ricatti(self.Q, self.R)).float().to(self.device) + ) def _compute_torques(self): """Compute torques from actions. @@ -41,7 +43,9 @@ def _compute_torques(self): actuated_dof_vel[:, idx] = self.dof_vel[:, dof_idx] idx += 1 - torques = self.lqr_u_prime(self.S, self.R, self.dof_state, self.x_desired, self.u_desired) + torques = self.lqr_u_prime( + self.S, self.R, self.dof_state, self.x_desired, self.u_desired + ) return torques.view(self.torques.shape) def solve_ricatti(self, Q, R): @@ -58,10 +62,14 @@ def lqr_u_prime(self, S, R, x, x_desired, u_desired): S = S.expand(batch_envs, -1, -1) R_inv = torch.linalg.inv(R.expand(batch_envs, -1, -1)) x_bar = x - x_desired.expand(batch_envs, -1) - K = torch.einsum("...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T), - S) - u_prime = u_desired - torch.einsum("...ij, ...jk -> ...ik", K, x_bar.unsqueeze(-1)).squeeze(-1) + K = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T), + S, + ) + u_prime = u_desired - torch.einsum( + "...ij, ...jk -> ...ik", K, x_bar.unsqueeze(-1) + ).squeeze(-1) return u_prime def linearize_pendulum_dynamics(self, x_desired): @@ -71,6 +79,9 @@ def linearize_pendulum_dynamics(self, x_desired): g = 9.81 ml2 = m * length**2 - A = torch.tensor([[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]], device=self.device) + A = torch.tensor( + [[0.0, 1.0], [g / length * torch.cos(x_desired[0]), -b / ml2]], + device=self.device, + ) B = torch.tensor([[0.0], [(1.0 / ml2)]], device=self.device) return A, B diff --git a/gym/envs/pendulum/lqr_pendulum_config.py b/gym/envs/pendulum/lqr_pendulum_config.py index e8d27a95..1c0975f9 100644 --- a/gym/envs/pendulum/lqr_pendulum_config.py +++ b/gym/envs/pendulum/lqr_pendulum_config.py @@ -5,7 +5,7 @@ class LQRPendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**12 + num_envs = 32 # 2**12 num_actuators = 1 # 1 for theta connecting base and pole 
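# What the nested einsum in lqr_u_prime computes, written as ordinary batched
# matrix products for comparison. Shapes are the ones used above: n_envs
# environments, state dim 2, input dim 1; the tensors are random here purely
# to check the algebra.
import torch

n_envs, n_x, n_u = 8, 2, 1
B_T = torch.randn(n_u, n_x).expand(n_envs, -1, -1)
S = torch.randn(n_x, n_x).expand(n_envs, -1, -1)
R_inv = torch.linalg.inv(torch.eye(n_u)).expand(n_envs, -1, -1)
x_bar = torch.randn(n_envs, n_x)
u_desired = torch.zeros(n_envs, n_u)

# einsum version (as in the patch)
K = torch.einsum("...ij, ...jk -> ...ik",
                 torch.einsum("...ij, ...jk -> ...ik", R_inv, B_T), S)
u_einsum = u_desired - torch.einsum(
    "...ij, ...jk -> ...ik", K, x_bar.unsqueeze(-1)).squeeze(-1)

# equivalent batched-@ version
u_matmul = u_desired - (R_inv @ B_T @ S @ x_bar.unsqueeze(-1)).squeeze(-1)

assert torch.allclose(u_einsum, u_matmul)
print(u_einsum.shape)  # (n_envs, n_u)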
episode_length_s = 3.0 @@ -59,7 +59,7 @@ class scaling(FixedRobotCfg.scaling): class LQRPendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OnPolicyRunner" #"LQRDataGenRunner" + runner_class_name = "LQRDataGenRunner" # "LQRDataGenRunner" class actor: hidden_dims = [128, 64, 32] diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 7236d169..7a577b1b 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -190,10 +190,22 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner_class_name = "CriticOnlyRunner" train_cfg.runner.resume = True train_cfg.runner.experiment_name += "_critic_only" - train_cfg.runner.run_name += f"{train_cfg.critic.critic_class_name}_custom_critic_only" if args.task == "pendulum" and args.custom_critic else "standard_critic_only" + train_cfg.runner.run_name += ( + f"{train_cfg.critic.critic_class_name}_custom_critic_only" + if args.task == "pendulum" and args.custom_critic + else "standard_critic_only" + ) else: - train_cfg.runner.experiment_name += "_custom_critic" if args.task == "pendulum" and args.custom_critic else "_standard_critic" - train_cfg.runner.run_name += f"{train_cfg.critic.critic_class_name}_custom_critic" if args.task == "pendulum" and args.custom_critic else "standard_critic" + train_cfg.runner.experiment_name += ( + "_custom_critic" + if args.task == "pendulum" and args.custom_critic + else "_standard_critic" + ) + train_cfg.runner.run_name += ( + f"{train_cfg.critic.critic_class_name}_custom_critic" + if args.task == "pendulum" and args.custom_critic + else "standard_critic" + ) def get_args(custom_parameters=None): diff --git a/learning/algorithms/ppo2_critic_only.py b/learning/algorithms/ppo2_critic_only.py index e2b48920..29068d08 100644 --- a/learning/algorithms/ppo2_critic_only.py +++ b/learning/algorithms/ppo2_critic_only.py @@ -73,7 +73,7 @@ def update(self, data, last_obs=None): ) self.update_critic(data) - self.update_actor(data) #! gradient step is disabled + self.update_actor(data) #! 
gradient step is disabled def update_critic(self, data): self.mean_value_loss = 0 @@ -117,7 +117,7 @@ def update_actor(self, data): actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std - entropy_batch = self.actor.entropy + # entropy_batch = self.actor.entropy # * KL if self.desired_kl is not None and self.schedule == "adaptive": @@ -154,7 +154,7 @@ def update_actor(self, data): ) surrogate_loss = torch.max(surrogate, surrogate_clipped).mean() - loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() + # loss = surrogate_loss - self.entropy_coef * entropy_batch.mean() # * Gradient step # self.optimizer.zero_grad() diff --git a/learning/modules/lqrc/autoencoder.py b/learning/modules/lqrc/autoencoder.py index 65d5f8cc..c4e24838 100644 --- a/learning/modules/lqrc/autoencoder.py +++ b/learning/modules/lqrc/autoencoder.py @@ -1,17 +1,10 @@ -import torch import torch.nn as nn from learning.modules.utils.neural_net import create_MLP class Autoencoder(nn.Module): - def __init__( - self, - num_inputs, - num_latent, - hidden_dims, - activation="elu" - ): + def __init__(self, num_inputs, num_latent, hidden_dims, activation="elu"): super().__init__() self.encoder = create_MLP(num_inputs, num_latent, hidden_dims, activation) self.decoder = create_MLP(num_latent, num_inputs, hidden_dims, activation) diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py index d29eed7f..1d1e267d 100644 --- a/learning/modules/lqrc/custom_critics.py +++ b/learning/modules/lqrc/custom_critics.py @@ -8,7 +8,6 @@ class CustomCriticBaseline(Critic): - def __init__( self, num_obs, @@ -81,11 +80,11 @@ def forward(self, x): output = self.activation_3(output) C = self.create_cholesky(output) A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) - y_pred = torch.einsum("...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", - x.unsqueeze(-1).transpose(-2, -1), - A), - x.unsqueeze(-1)).squeeze(-1) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ).squeeze(-1) return y_pred def create_cholesky(self, x): @@ -126,7 +125,8 @@ def __init__( device=device, ) self.activation_3.register_forward_hook(self.save_intermediate()) - self.const_penalty = 0.1 # 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") + # 0.0 if kwargs.get("const_penalty") is None else kwargs.get("const_penalty") + self.const_penalty = 0.1 def forward(self, x): output = self.connection_1(x) @@ -138,11 +138,11 @@ def forward(self, x): C = self.create_cholesky(output[..., :-1]) A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) c = output[..., -1] - y_pred = torch.einsum("...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", - x.unsqueeze(-1).transpose(-2, -1), - A), - x.unsqueeze(-1)) + c.unsqueeze(-1).unsqueeze(-1) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum("...ij, ...jk -> ...ik", x.unsqueeze(-1).transpose(-2, -1), A), + x.unsqueeze(-1), + ) + c.unsqueeze(-1).unsqueeze(-1) return y_pred.squeeze(-1) def save_intermediate(self): @@ -191,22 +191,24 @@ def forward(self, x): output = self.activation_2(output) output = self.connection_3(output) output = self.activation_3(output) - C = self.create_cholesky(output[..., :self.L_indices]) + C = self.create_cholesky(output[..., : self.L_indices]) A = torch.einsum("...ij, ...jk -> ...ik", C, 
C.transpose(-2, -1)) - offset = output[..., self.L_indices:] + offset = output[..., self.L_indices :] x_bar = x - offset - y_pred = torch.einsum("...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", - x_bar.unsqueeze(-1).transpose(-2, -1), - A), - x_bar.unsqueeze(-1)).squeeze(-1) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum( + "...ij, ...jk -> ...ik", x_bar.unsqueeze(-1).transpose(-2, -1), A + ), + x_bar.unsqueeze(-1), + ).squeeze(-1) return y_pred def save_intermediate(self): def hook(module, input, output): - C = self.create_cholesky(output[..., :self.L_indices]) + C = self.create_cholesky(output[..., : self.L_indices]) A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) - offset = output[..., self.L_indices:] + offset = output[..., self.L_indices :] self.intermediates["A"] = A self.intermediates["offset"] = offset @@ -238,11 +240,13 @@ def forward(self, x): output = self.activation_3(output) C = self.create_cholesky(output) A = torch.einsum("...ij, ...jk -> ...ik", C, C.transpose(-2, -1)) - y_pred = torch.einsum("...ij, ...jk -> ...ik", - torch.einsum("...ij, ...jk -> ...ik", - xhat.unsqueeze(-1).transpose(-2, -1), - A), - xhat.unsqueeze(-1)).squeeze(-1) + y_pred = torch.einsum( + "...ij, ...jk -> ...ik", + torch.einsum( + "...ij, ...jk -> ...ik", xhat.unsqueeze(-1).transpose(-2, -1), A + ), + xhat.unsqueeze(-1), + ).squeeze(-1) return y_pred def save_intermediate(self): diff --git a/learning/modules/lqrc/evaluate_critic.py b/learning/modules/lqrc/evaluate_critic.py index 32ebead4..645c4c20 100644 --- a/learning/modules/lqrc/evaluate_critic.py +++ b/learning/modules/lqrc/evaluate_critic.py @@ -4,13 +4,13 @@ from utils import critic_eval_args, get_load_path from plotting import ( - plot_custom_critic, plot_critic_prediction_only, ) from learning.modules import Critic from learning.modules.lqrc.custom_critics import ( - CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, - CholeskyOffset2 + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2, ) import torch @@ -74,5 +74,5 @@ def model_switch(args): fn = args["fn"] plot_critic_prediction_only( - x, torch.vstack(y_pred), save_path + f"/{fn}.png", contour=args["contour"] - ) + x, torch.vstack(y_pred), save_path + f"/{fn}.png", contour=args["contour"] + ) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 475bf9ca..b26e99bf 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -343,10 +343,14 @@ def plot_autoencoder(targets, predictions, fn): _, ax = plt.subplots() ax.scatter(targets[:, 0], targets[:, 1], label="Targets", alpha=0.5) ax.scatter(predictions[:, 0], predictions[:, 1], label="Predictions", alpha=0.5) - ax.set_xlim(min(np.min(targets[:, 0]), np.min(predictions[:, 0])), - max(np.max(targets[:, 0]), np.max(predictions[:, 0]))) - ax.set_ylim(min(np.min(targets[:, 1]), np.min(predictions[:, 1])), - max(np.max(targets[:, 1]), np.max(predictions[:, 1]))) + ax.set_xlim( + min(np.min(targets[:, 0]), np.min(predictions[:, 0])), + max(np.max(targets[:, 0]), np.max(predictions[:, 0])), + ) + ax.set_ylim( + min(np.min(targets[:, 1]), np.min(predictions[:, 1])), + max(np.max(targets[:, 1]), np.max(predictions[:, 1])), + ) ax.set_xlabel("theta") ax.set_ylabel("theta dot") ax.legend(loc="upper left") diff --git a/learning/modules/lqrc/train_autoencoder.py b/learning/modules/lqrc/train_autoencoder.py index 0b3b7c5f..913c3e91 100644 --- a/learning/modules/lqrc/train_autoencoder.py +++ 
b/learning/modules/lqrc/train_autoencoder.py @@ -9,7 +9,7 @@ def generate_data(lb, ub, n): - return (ub - lb).expand(n, -1)*torch.rand(n, ub.shape[-1]) + lb + return (ub - lb).expand(n, -1) * torch.rand(n, ub.shape[-1]) + lb if __name__ == "__main__": @@ -50,10 +50,10 @@ def generate_data(lb, ub, n): targets = generate_data(lb, ub, 100) predictions = torch.zeros_like(targets) for ix, target in enumerate(targets): - prediction = model(target) - predictions[ix, :] = prediction - loss = nn.MSELoss(reduction="mean")(target, prediction) - test_loss.append(loss) + prediction = model(target) + predictions[ix, :] = prediction + loss = nn.MSELoss(reduction="mean")(target, prediction) + test_loss.append(loss) print(f"Average loss on test set {torch.mean(torch.tensor(test_loss))}") @@ -64,7 +64,7 @@ def generate_data(lb, ub, n): os.makedirs(save_path) plot_loss(training_loss, f"{save_path}/{save_str}_loss") - plot_autoencoder(targets, predictions, f"{save_path}/{save_str}_predictions") + plot_autoencoder(targets, predictions, f"{save_path}/{save_str}_predictions") if args["save_model"]: torch.save( diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..cac170dc 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -31,6 +31,9 @@ def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + remove_zero_weighted_rewards( + train_cfg["critic"]["reward"]["termination_weight"] + ) self.actor_cfg = train_cfg["actor"] self.critic_cfg = train_cfg["critic"] diff --git a/learning/runners/critic_only_runner.py b/learning/runners/critic_only_runner.py index b37d90e9..d5a9eb4a 100644 --- a/learning/runners/critic_only_runner.py +++ b/learning/runners/critic_only_runner.py @@ -5,13 +5,7 @@ from gym import LEGGED_GYM_ROOT_DIR from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic -from learning.modules.lqrc import ( - Cholesky, - CholeskyPlusConst, - CholeskyOffset1, - CholeskyOffset2 -) +from learning.modules import Actor from learning.utils import Logger from .on_policy_runner import OnPolicyRunner @@ -34,7 +28,7 @@ def __init__(self, env, train_cfg, device="cpu"): def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) - num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 critic_class_name = self.critic_cfg["critic_class_name"] num_actions = self.get_action_size(self.actor_cfg["actions"]) actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) diff --git a/learning/runners/custom_critic_runner.py b/learning/runners/custom_critic_runner.py index 10f98f24..d2621b78 100644 --- a/learning/runners/custom_critic_runner.py +++ b/learning/runners/custom_critic_runner.py @@ -5,8 +5,7 @@ from gym import LEGGED_GYM_ROOT_DIR from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic -from learning.modules.lqrc import Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 +from learning.modules import Actor from learning.utils import Logger from .on_policy_runner import OnPolicyRunner @@ -28,7 +27,7 @@ def __init__(self, env, train_cfg, device="cpu"): def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) - num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) # noqa: F841 
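# Core construction behind the Cholesky-style critics touched in this patch,
# reduced to its essentials: a vector of n(n+1)/2 network outputs fills a
# lower-triangular L, A = L @ L^T is positive semidefinite by construction,
# and the value estimate is the quadratic form x^T A x (optionally plus a
# learned constant, or evaluated around a learned offset). Here the "network
# output" is just a random vector; in the repository it comes from the
# critic's MLP head.
import torch

n = 2                            # observation dimension (theta, theta_dot)
n_tril = n * (n + 1) // 2        # number of free entries in L

raw = torch.randn(n_tril)        # stand-in for the last-layer activations
L = torch.zeros(n, n)
rows, cols = torch.tril_indices(n, n)
L[rows, cols] = raw

A = L @ L.T                      # PSD for any raw vector
x = torch.randn(n)
value = x @ A @ x                # quadratic form, >= 0 for every x

print(A)
print("eigenvalues:", torch.linalg.eigvalsh(A))  # all >= 0 up to round-off
print("V(x) =", value.item())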
critic_class_name = self.critic_cfg["critic_class_name"] num_actions = self.get_action_size(self.actor_cfg["actions"]) actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) diff --git a/learning/runners/lqr_data_gen_runner.py b/learning/runners/lqr_data_gen_runner.py index 751aba5c..5aafa608 100644 --- a/learning/runners/lqr_data_gen_runner.py +++ b/learning/runners/lqr_data_gen_runner.py @@ -1,13 +1,13 @@ import os import time import torch -import numpy as np from tensordict import TensorDict from gym import LEGGED_GYM_ROOT_DIR from learning.utils import Logger from .custom_critic_runner import OnPolicyRunner from learning.storage import DictStorage +from learning.utils import export_to_numpy logger = Logger() storage = DictStorage() @@ -22,10 +22,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.cfg["max_iterations"], self.device, ) - self.total_steps = self.num_learning_iterations*self.num_steps_per_env - self.data_dict = {"observations": np.zeros((self.total_steps, env.num_envs, env.dof_state.shape[0])), - "actions": np.zeros((self.total_steps, env.num_envs, env.num_actuators)), - "total_rewards": np.zeros((self.total_steps, env.num_envs, 1))} + self.total_steps = self.num_learning_iterations * self.num_steps_per_env def learn(self): self.set_up_logger() @@ -57,6 +54,9 @@ def learn(self): ) logger.tic("runtime") + + dict_save_path = self.create_data_save_path_and_name() + for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") @@ -105,13 +105,12 @@ def learn(self): logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(dones) - self.data_dict["observations"][self.it*i, ...] = actor_obs - self.data_dict["actions"][self.it*i, ...] = self.torques - self.data_dict["total_rewards"][self.it*i, ...] 
= total_rewards logger.toc("collection") logger.tic("learning") self.alg.update(storage.data) + torch.save(storage.data.cpu(), dict_save_path + str(self.it) + ".pt") + export_to_numpy(storage.data, dict_save_path + str(self.it)) storage.clear() logger.toc("learning") logger.log_all_categories() @@ -125,10 +124,32 @@ def learn(self): self.save() self.save() + print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def create_data_save_path_and_name(self): time_str = time.strftime("%Y%m%d_%H%M%S") - save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "lqrc", "lqr_data", f"lqr_{time_str}.npz") + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "lqr_data", + f"run_{time_str}", + "iteration_", + ) dir_path = os.path.dirname(save_path) if not os.path.exists(dir_path): os.makedirs(dir_path) - np.savez(save_path, *self.data_dict) - print(f"Saved data from LQR Pendulum run to {save_path}") + return save_path diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..aad737f4 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -1,3 +1,4 @@ +import numpy as np import torch from tensordict import TensorDict @@ -71,3 +72,12 @@ def create_uniform_generator(data, batch_size, num_epochs, keys=None): indices[i * batch_size : (i + 1) * batch_size] ] yield batched_data + + +@torch.no_grad +def export_to_numpy(data, path): + # check if path ends iwth ".npz", and if not append it. 
+ if not path.endswith(".npz"): + path += ".npz" + np.savez_compressed(path, **{key: val.cpu().numpy() for key, val in data.items()}) + return diff --git a/scripts/train.py b/scripts/train.py index 7e621dee..176fffb3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils import get_args, task_registry from gym.utils.logging_and_saving import wandb_singleton from gym.utils.logging_and_saving import local_code_save_helper From 8ad4eb0beedfa9a0e1a88611d1fef8f9c0f76182 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 1 May 2024 10:54:57 -0400 Subject: [PATCH 46/69] save data every step for offline use --- gym/envs/base/fixed_robot.py | 4 ++-- gym/envs/pendulum/pendulum.py | 17 +++++++++++++++ gym/envs/pendulum/pendulum_config.py | 12 +++++------ learning/runners/on_policy_runner.py | 31 ++++++++++++++++++++++++---- 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index a7c1b7ae..b8f47ceb 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -417,8 +417,8 @@ def _init_buffers(self): self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) # * check that init range highs and lows are consistent # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + # if self.cfg.init_state.reset_mode == "reset_to_range": + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 18f3d6c9..aa2ae8af 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -8,6 +8,23 @@ def _post_physics_step(self): """Update all states that are not handled in PhysX""" super()._post_physics_step() + def reset_to_uniform(self, env_ids): + grid_points = self.num_envs // 2 + grid_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ).unsqueeze(1) + grid_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ).unsqueeze(1) + self.dof_pos[env_ids] = grid_pos[env_ids % grid_points] + self.dof_vel[env_ids] = grid_vel[env_ids % grid_points] + def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] return self._sqrdexp(theta_rwd.squeeze(dim=-1)) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 2c974232..dc31fafe 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -23,7 +23,7 @@ class init_state(FixedRobotCfg.init_state): # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" # * initial conditions for reset_to_range dof_pos_range = { @@ -33,8 +33,8 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -59,7 +59,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "CustomCriticRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [128, 64, 32] @@ -121,6 +121,6 @@ class algorithm(FixedRobotCfgPPO.algorithm): class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates + max_iterations = 200 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 50 diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..6b631cb3 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -1,11 +1,14 @@ import os import torch +import time +from gym import LEGGED_GYM_ROOT_DIR from tensordict import TensorDict from learning.utils import Logger from .BaseRunner import BaseRunner from learning.storage import DictStorage +from learning.utils import export_to_numpy logger = Logger() storage = DictStorage() @@ -30,7 +33,7 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # self.save() # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) @@ -102,6 +105,7 @@ def learn(self): logger.toc("collection") logger.tic("learning") + self.save() self.alg.update(storage.data) storage.clear() logger.toc("learning") @@ -112,9 +116,10 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") def update_rewards(self, rewards_dict, terminated): rewards_dict.update( @@ -156,6 +161,9 @@ def save(self): }, path, ) + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) @@ -178,3 +186,18 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def create_data_save_path_and_name(self): + time_str = time.strftime("%Y%m%d_%H%M%S") + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "logs", + "lqrc", + "lqr_data", + f"run_{time_str}", + "iteration_", + ) + dir_path = os.path.dirname(save_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + return save_path From 45685889f4ab98e477b779aca02caeb0f1837bb8 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 1 May 2024 14:13:17 -0400 Subject: [PATCH 47/69] refactor GAE computation to be explicit --- learning/algorithms/ppo2.py | 5 ++++- learning/utils/dict_utils.py | 34 +++++++++++++--------------------- 2 files changed, 17 insertions(+), 
22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index db56ef8c..6fd2e798 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -67,9 +67,12 @@ def update(self, data, last_obs=None): else: with torch.no_grad(): last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( + + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( data, self.gamma, self.lam, self.critic, last_values ) + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) self.update_actor(data) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index aad737f4..4dc8ab43 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -10,30 +10,28 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): else: last_values = critic.evaluate(data["critic_obs"][-1]) - data.update({"returns": torch.zeros_like(data["rewards"])}) - data["returns"][-1] = last_values * ~data["dones"][-1] + returns = torch.zeros_like(data["rewards"]) + returns[-1] = last_values * ~data["dones"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] - data["returns"][k] = ( - data["rewards"][k] + gamma * data["returns"][k + 1] * not_done - ) - data["returns"] = (data["returns"] - data["returns"].mean()) / ( - data["returns"].std() + 1e-8 - ) - return + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + + return normalize(returns) @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) - data.update({"advantages": torch.zeros_like(data["values"])}) +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -48,15 +46,9 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) - - data["returns"] = data["advantages"] + data["values"] + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + return normalize(advantages) # todo change num_epochs to num_batches From e781f0241f28d86e0943f561a685dbb2dabd212b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 1 May 2024 14:21:27 -0400 Subject: [PATCH 48/69] store next_obs explicitly, use for last_values --- learning/algorithms/ppo2.py | 15 +++------------ learning/runners/on_policy_runner.py | 4 ++++ learning/utils/dict_utils.py | 11 +++++++++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 6fd2e798..99e3a742 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -61,16 +61,10 @@ def switch_to_train(self): def act(self, obs): return 
self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - + def update(self, data): data["values"] = self.critic.evaluate(data["critic_obs"]) data["advantages"] = compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] @@ -161,10 +155,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 60dff1e9..c1758000 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,8 +37,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -93,6 +95,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 4dc8ab43..774b77cb 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -25,7 +25,8 @@ def normalize(input, eps=1e-8): @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this @@ -53,10 +54,16 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator(data, batch_size, num_epochs, keys=None): +def create_uniform_generator( + data, batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): n, m = data.shape total_data = n * m num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 4b8cabbd7f836146dbe381b42edf703049234164 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 26 Mar 2024 18:43:58 -0400 Subject: [PATCH 49/69] refactor critic loss function to consume obs and self-evaluate --- learning/algorithms/ppo2.py | 4 +--- learning/modules/critic.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 99e3a742..e27a6db7 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -24,7 +24,6 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - loss_fn="MSE", device="cpu", **kwargs, ): @@ -80,8 +79,7 @@ def update_critic(self, data): batch_size = total_data // self.num_mini_batches generator = 
create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 744de477..35c34843 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -27,5 +27,5 @@ def evaluate(self, critic_observations): critic_observations = self.obs_rms(critic_observations) return self.NN(critic_observations).squeeze() - def loss_fn(self, input, target): - return nn.functional.mse_loss(input, target, reduction="mean") + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.evaluate(obs), target, reduction="mean") From 1d6ff39fdae8d749dea9d5511eb04d2ad960a0b6 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 1 May 2024 14:27:17 -0400 Subject: [PATCH 50/69] normalize advantages only after computing returns --- gym/envs/pendulum/pendulum_config.py | 26 ++++++++++++++------------ learning/algorithms/ppo2.py | 3 ++- learning/utils/dict_utils.py | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index d86e4fa5..419d3437 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -102,22 +102,24 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] desired_kl = 0.01 - max_grad_norm = 1.0 - plus_c_penalty = 0.1 class runner(FixedRobotCfgPPO.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index e27a6db7..a025c8ac 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -5,6 +5,7 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, ) @@ -66,8 +67,8 @@ def update(self, data): data, self.gamma, self.lam, self.critic ) data["returns"] = data["advantages"] + data["values"] - self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) self.update_actor(data) def update_critic(self, data): diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 774b77cb..e7e567dd 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -49,7 +49,7 @@ def compute_generalized_advantages(data, gamma, lam, critic): ) advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - return normalize(advantages) + return advantages # todo change num_epochs to num_batches From 2143670c637e28448ac39f35095652e6f44803cd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 1 May 2024 14:39:23 -0400 Subject: [PATCH 51/69] WIP: 
set up datalogger, and offline training --- gym/envs/pendulum/pendulum_config.py | 2 +- learning/runners/__init__.py | 1 + learning/runners/datalogging_runner.py | 209 +++++++++++++++++++++++++ scripts/offline_critics.py | 55 +++++++ 4 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 learning/runners/datalogging_runner.py create mode 100644 scripts/offline_critics.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 419d3437..a45f21cc 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -59,7 +59,7 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "DataLoggingRunner" class actor: hidden_dims = [128, 64, 32] diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index f957b03d..da07bb47 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -36,3 +36,4 @@ from .custom_critic_runner import CustomCriticRunner from .critic_only_runner import CriticOnlyRunner from .lqr_data_gen_runner import LQRDataGenRunner +from .datalogging_runner import DataLoggingRunner diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py new file mode 100644 index 00000000..06b680cd --- /dev/null +++ b/learning/runners/datalogging_runner.py @@ -0,0 +1,209 @@ +import os +import torch +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.storage import DictStorage +from learning.utils import export_to_numpy + +logger = Logger() +storage = DictStorage() + + +class DataLoggingRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + + # * start up storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + + self.save() + # burn in observation normalization. 
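+        # the burn-in below only rolls the policy to let the running mean/std in the
+        # actor/critic observation normalizers settle before any gradient updates;
+        # see burn_in_normalization() further down in this class.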
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + storage.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.save() + self.alg.update(storage.data) + storage.clear() + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + # if self.it % self.save_interval == 0: + # self.save() + # self.save() + # print(f"Saved data from LQR Pendulum run to {dict_save_path}") + + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + 
torch.save(storage.data.cpu(), path_data + ".pt") + export_to_numpy(storage.data, path_data + ".npz") + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py new file mode 100644 index 00000000..d91e0018 --- /dev/null +++ b/scripts/offline_critics.py @@ -0,0 +1,55 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from learning.modules import Actor, Critic +from learning.modules.lqrc import Cholesky +from gym.utils.helpers import class_to_dict +from learning.utils import ( + compute_generalized_advantages, +) + +from gym import LEGGED_GYM_ROOT_DIR +import os +import torch + +# handle some bookkeeping +args = get_args() +env_cfg, train_cfg = task_registry.create_cfgs(args) +env_cfg = class_to_dict(env_cfg) +train_cfg = class_to_dict(train_cfg) +# +run_name = "May01_10-47-07_lam09standard_critic" +log_dir = os.path.join( + LEGGED_GYM_ROOT_DIR, "logs", train_cfg["runner"]["experiment_name"], run_name +) + +# create critic based on saved policy +iteration = 1 +loaded_dict = torch.load(os.path.join(log_dir, "model_{}.pt".format(iteration))) +trained_actor = Actor(num_obs=2, num_actions=1, **train_cfg["actor"]) +trained_actor.load_state_dict(loaded_dict["actor_state_dict"]) +trained_critic = Critic(num_obs=2, **train_cfg["critic"]) +trained_critic.load_state_dict(loaded_dict["critic_state_dict"]) + +# create fresh critic + +test_critic = Cholesky(num_obs=2, **train_cfg["critic"]) + +# load data + +data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))) +gamma = train_cfg["algorithm"]["gamma"] +lam = train_cfg["algorithm"]["lam"] + +# compute ground-truth + +# train new critic + +data["values"] = test_critic.evaluate(data["critic_obs"]) +data["advantages"] = compute_generalized_advantages(data, gamma, lam, test_critic) +data["returns"] = data["advantages"] + data["values"] + +# plug in supervised learning here, based off of PPO2.update + +# compare new and old critics + +end = True From 2ddc584254e104c068ab36c20c91bdb72b602955 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 2 May 2024 08:58:34 -0400 Subject: [PATCH 52/69] cherry-picked refactored data generator --- gym/envs/base/legged_robot_config.py | 27 ++++++++++------- gym/envs/mini_cheetah/mini_cheetah_config.py | 15 +--------- .../mini_cheetah/mini_cheetah_ref_config.py | 20 ++----------- learning/algorithms/ppo2.py | 29 +++++++++---------- learning/utils/dict_utils.py | 5 +++- 5 files changed, 37 insertions(+), 59 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 6a868d22..b6667502 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -293,25 +293,30 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - 
value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_grad_steps = 10 + # new + storage_size = 2**17 # new + mini_batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 24 + num_steps_per_env = 24 # deprecate max_iterations = 1500 save_interval = 50 run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 560ac573..447d4d3b 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -190,20 +190,7 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed - discount_horizon = 1.0 # [s] - # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 9d59501a..eedbd904 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -137,27 +137,11 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True # deprecate for PPO2 - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - storage_size = 2**17 # new - mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed - discount_horizon = 1.0 # [s] - lam = 0.95 - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 - max_grad_norm = 1.0 + pass class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 + num_steps_per_env = 32 # deprecate diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index a025c8ac..f03d0e29 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -14,8 +14,8 @@ def __init__( self, actor, critic, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -42,8 +42,8 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -75,10 +75,11 @@ def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - 
batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() @@ -90,8 +91,6 @@ def update_critic(self, data): self.mean_value_loss /= counter def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) self.mean_surrogate_loss = 0 counter = 0 @@ -102,12 +101,12 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - # ! refactor how this is done self.actor.act(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index e7e567dd..4f59c069 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -59,11 +59,14 @@ def create_uniform_generator( ): n, m = data.shape total_data = n * m + + if batch_size > total_data: + batch_size = total_data + num_batches_per_epoch = total_data // batch_size if max_gradient_steps: num_epochs = max_gradient_steps // num_batches_per_epoch num_epochs = max(num_epochs, 1) - for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): From 9603a7b902896c2fa3e0f9a797897f76d1687203 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 2 May 2024 18:13:41 -0400 Subject: [PATCH 53/69] WIP: offline_critics runs --- scripts/offline_critics.py | 43 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index d91e0018..ccd41cc8 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -1,8 +1,4 @@ -from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry -from learning.modules import Actor, Critic from learning.modules.lqrc import Cholesky -from gym.utils.helpers import class_to_dict from learning.utils import ( compute_generalized_advantages, ) @@ -11,34 +7,39 @@ import os import torch +DEVICE = "cuda:0" # handle some bookkeeping -args = get_args() -env_cfg, train_cfg = task_registry.create_cfgs(args) -env_cfg = class_to_dict(env_cfg) -train_cfg = class_to_dict(train_cfg) # -run_name = "May01_10-47-07_lam09standard_critic" +run_name = "May02_08-59-46_standard_critic" log_dir = os.path.join( - LEGGED_GYM_ROOT_DIR, "logs", train_cfg["runner"]["experiment_name"], run_name + LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) # create critic based on saved policy -iteration = 1 -loaded_dict = torch.load(os.path.join(log_dir, "model_{}.pt".format(iteration))) -trained_actor = Actor(num_obs=2, num_actions=1, **train_cfg["actor"]) -trained_actor.load_state_dict(loaded_dict["actor_state_dict"]) -trained_critic = Critic(num_obs=2, **train_cfg["critic"]) -trained_critic.load_state_dict(loaded_dict["critic_state_dict"]) +# iteration = 1 +# loaded_dict = torch.load(os.path.join(log_dir, 
"model_{}.pt".format(iteration))) +# trained_actor = Actor(num_obs=2, num_actions=1, **train_cfg["actor"]) +# trained_actor.load_state_dict(loaded_dict["actor_state_dict"]) +# trained_critic = Critic(num_obs=2, **train_cfg["critic"]) +# trained_critic.load_state_dict(loaded_dict["critic_state_dict"]) # create fresh critic -test_critic = Cholesky(num_obs=2, **train_cfg["critic"]) +test_critic_params = { + "num_obs": 2, + "hidden_dims": None, + "activation": "elu", + "normalize_obs": False, + "output_size": 1, + "device": DEVICE, +} +test_critic = Cholesky(**test_critic_params).to(DEVICE) # load data - -data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))) -gamma = train_cfg["algorithm"]["gamma"] -lam = train_cfg["algorithm"]["lam"] +iteration = 1 +data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) +gamma = 0.99 +lam = 0.99 # compute ground-truth From 037cc1c630b979152b01f2caf6524e0e134ec7d9 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 2 May 2024 18:27:08 -0400 Subject: [PATCH 54/69] trains new critic --- scripts/offline_critics.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index ccd41cc8..404a2c47 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -1,11 +1,13 @@ from learning.modules.lqrc import Cholesky from learning.utils import ( compute_generalized_advantages, + compute_MC_returns, + create_uniform_generator, ) - from gym import LEGGED_GYM_ROOT_DIR import os import torch +from torch import nn DEVICE = "cuda:0" # handle some bookkeeping @@ -15,14 +17,6 @@ LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) -# create critic based on saved policy -# iteration = 1 -# loaded_dict = torch.load(os.path.join(log_dir, "model_{}.pt".format(iteration))) -# trained_actor = Actor(num_obs=2, num_actions=1, **train_cfg["actor"]) -# trained_actor.load_state_dict(loaded_dict["actor_state_dict"]) -# trained_critic = Critic(num_obs=2, **train_cfg["critic"]) -# trained_critic.load_state_dict(loaded_dict["critic_state_dict"]) - # create fresh critic test_critic_params = { @@ -33,8 +27,9 @@ "output_size": 1, "device": DEVICE, } +learning_rate = 1.0e-4 test_critic = Cholesky(**test_critic_params).to(DEVICE) - +critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) # load data iteration = 1 data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) @@ -43,14 +38,33 @@ # compute ground-truth +episode_rollouts = compute_MC_returns(data, gamma) + # train new critic data["values"] = test_critic.evaluate(data["critic_obs"]) data["advantages"] = compute_generalized_advantages(data, gamma, lam, test_critic) data["returns"] = data["advantages"] + data["values"] -# plug in supervised learning here, based off of PPO2.update +mean_value_loss = 0 +counter = 0 +max_gradient_steps = 24 +max_grad_norm = 1.0 +batch_size = 2**16 +generator = create_uniform_generator( + data, + batch_size, + max_gradient_steps=max_gradient_steps, +) +for batch in generator: + value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"]) + critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm) + critic_optimizer.step() + mean_value_loss += value_loss.item() + print("Value loss: ", value_loss.item()) + counter += 1 +mean_value_loss /= counter # compare new and old critics - -end = True From 
98731f1a1899fe6f0cad7eb874a91a14889359db Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Thu, 2 May 2024 22:15:50 -0400 Subject: [PATCH 55/69] WIP plotting func added for critics --- learning/modules/lqrc/plotting.py | 33 ++++++++++++++++++++++++++++++- scripts/offline_critics.py | 13 +++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index b26e99bf..87c27428 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -2,7 +2,7 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np -from matplotlib.colors import CenteredNorm +from matplotlib.colors import TwoSlopeNorm matplotlib.rcParams["pdf.fonttype"] = 42 matplotlib.rcParams["ps.fonttype"] = 42 @@ -13,6 +13,37 @@ matplotlib.rc("font", **font) +def plot_pendulum_critic_predictions( + x, + predictions, + targets, + title, + fn, + colorbar_label="f(x)", + vmin=None, + vmax=None +): + x = x.detach().cpu().numpy() + predictions = predictions.detach().cpu().numpy() + targets = targets.detach().cpu().numpy() + vmin = np.min(predictions) if vmin is None else vmin + vmax = np.max(predictions) if vmax is None else vmax + + fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) + scatter = axis.scatter( + x[:, 0], + x[:, 1], + c=predictions - targets, + cmap="bwr", + marker="o", + alpha=0.5, + norm=TwoSlopeNorm(vcenter=0.0, vmin=vmin, vmax=vmax), + ) + fig.colorbar(scatter, ax=axis, shrink=0.95, label=colorbar_label) + axis.set_title(title) + plt.savefig(f"{fn}_value_error_colormap.png") + + def graph_3D_helper(ax, contour=False): if contour: return ax.contourf diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 404a2c47..ef173448 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -4,6 +4,7 @@ compute_MC_returns, create_uniform_generator, ) +from learning.modules.lqrc.plotting import plot_pendulum_critic_predictions from gym import LEGGED_GYM_ROOT_DIR import os import torch @@ -28,7 +29,8 @@ "device": DEVICE, } learning_rate = 1.0e-4 -test_critic = Cholesky(**test_critic_params).to(DEVICE) +critic_name = "Cholesky" +test_critic = eval(f"{critic_name}(**test_critic_params).to(DEVICE)") critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) # load data iteration = 1 @@ -68,3 +70,12 @@ mean_value_loss /= counter # compare new and old critics +vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) # thinking of swtiching this to a list comprehension when we have multiple critics +vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) +plot_pendulum_critic_predictions(x=batch, + predictions=data["values"], + targets=data["returns"], + title=f"{critic_name}_it{iteration}", + fn=f"{critic_name}_it{iteration}", + vmin=vmin, + vmax=vmax) From 9190f40ffb3ca298bd630ca79e2ba3b7d21ce100 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Thu, 2 May 2024 22:16:15 -0400 Subject: [PATCH 56/69] critic specific plotting no grad --- learning/modules/lqrc/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 87c27428..ae2a5c42 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -41,7 +41,7 @@ def plot_pendulum_critic_predictions( ) fig.colorbar(scatter, ax=axis, 
shrink=0.95, label=colorbar_label) axis.set_title(title) - plt.savefig(f"{fn}_value_error_colormap.png") + plt.savefig(f"{fn}.png") def graph_3D_helper(ax, contour=False): From fff4ade99beffa41963bd3b432576a33ce36d5ce Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 3 May 2024 17:39:11 -0400 Subject: [PATCH 57/69] fixed grid to ensure full coverage --- gym/envs/pendulum/pendulum.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index f7897257..317c7ed0 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,3 +1,4 @@ +from math import sqrt import torch from gym.envs.base.fixed_robot import FixedRobot @@ -9,21 +10,22 @@ def _post_physics_step(self): super()._post_physics_step() def reset_to_uniform(self, env_ids): - grid_points = self.num_envs // 2 - grid_pos = torch.linspace( + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( self.dof_pos_range[0, 0], self.dof_pos_range[0, 1], grid_points, device=self.device, - ).unsqueeze(1) - grid_vel = torch.linspace( + ) + lin_vel = torch.linspace( self.dof_vel_range[0, 0], self.dof_vel_range[0, 1], grid_points, device=self.device, - ).unsqueeze(1) - self.dof_pos[env_ids] = grid_pos[env_ids % grid_points] - self.dof_vel[env_ids] = grid_vel[env_ids % grid_points] + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) def _reward_theta(self): theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] From f9499019fb3075ab418f4199efe87222976dbdbd Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 3 May 2024 17:39:29 -0400 Subject: [PATCH 58/69] remove normalization from MC returns --- learning/utils/dict_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 4f59c069..e22dde01 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -16,7 +16,7 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): not_done = ~data["dones"][k] returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done - return normalize(returns) + return returns @torch.no_grad From 193d66e63f231a85a7a9922d4315a30995063264 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 3 May 2024 17:39:41 -0400 Subject: [PATCH 59/69] plotting for single critic finished --- learning/modules/lqrc/plotting.py | 48 ++++++++++++++----------------- scripts/offline_critics.py | 28 +++++++++++------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index ae2a5c42..87b685f4 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -2,7 +2,7 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np -from matplotlib.colors import TwoSlopeNorm +from matplotlib.colors import CenteredNorm matplotlib.rcParams["pdf.fonttype"] = 42 matplotlib.rcParams["ps.fonttype"] = 42 @@ -13,35 +13,29 @@ matplotlib.rc("font", **font) -def plot_pendulum_critic_predictions( - x, - predictions, - targets, - title, - fn, - colorbar_label="f(x)", - vmin=None, - vmax=None +def plot_pendulum_single_critic_predictions( + x, predictions, targets, title, fn, 
colorbar_label="f(x)" ): - x = x.detach().cpu().numpy() - predictions = predictions.detach().cpu().numpy() - targets = targets.detach().cpu().numpy() - vmin = np.min(predictions) if vmin is None else vmin - vmax = np.max(predictions) if vmax is None else vmax - - fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) - scatter = axis.scatter( - x[:, 0], - x[:, 1], - c=predictions - targets, - cmap="bwr", - marker="o", - alpha=0.5, - norm=TwoSlopeNorm(vcenter=0.0, vmin=vmin, vmax=vmax), + x = x.detach().cpu().numpy().reshape(-1, 2) + predictions = predictions.detach().cpu().numpy().reshape(-1) + targets = targets.detach().cpu().numpy().reshape(-1) + error = predictions - targets + + fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6)) + # error + scatter = axes[0].scatter( + x[:, 0], x[:, 1], c=error, cmap="bwr", alpha=0.5, norm=CenteredNorm() + ) + # prediction values + scatter = axes[1].scatter( + x[:, 0], x[:, 1], c=predictions, cmap="bwr", alpha=0.5, norm=CenteredNorm() ) - fig.colorbar(scatter, ax=axis, shrink=0.95, label=colorbar_label) - axis.set_title(title) + axes[0].set_title("Error") + axes[1].set_title("Predictions") + fig.colorbar(scatter, ax=axes.ravel().tolist(), shrink=0.95, label=colorbar_label) + fig.suptitle(title, fontsize=16) plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") def graph_3D_helper(ax, contour=False): diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index ef173448..865cca2e 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -1,10 +1,11 @@ +import time from learning.modules.lqrc import Cholesky from learning.utils import ( compute_generalized_advantages, compute_MC_returns, create_uniform_generator, ) -from learning.modules.lqrc.plotting import plot_pendulum_critic_predictions +from learning.modules.lqrc.plotting import plot_pendulum_single_critic_predictions from gym import LEGGED_GYM_ROOT_DIR import os import torch @@ -13,7 +14,7 @@ DEVICE = "cuda:0" # handle some bookkeeping # -run_name = "May02_08-59-46_standard_critic" +run_name = "May03_17-11-21_standard_critic" # "May02_08-59-46_standard_critic" log_dir = os.path.join( LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) @@ -70,12 +71,17 @@ mean_value_loss /= counter # compare new and old critics -vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) # thinking of swtiching this to a list comprehension when we have multiple critics -vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) -plot_pendulum_critic_predictions(x=batch, - predictions=data["values"], - targets=data["returns"], - title=f"{critic_name}_it{iteration}", - fn=f"{critic_name}_it{iteration}", - vmin=vmin, - vmax=vmax) +time_str = time.strftime("%Y%m%d_%H%M%S") +save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str) +if not os.path.exists(save_path): + os.makedirs(save_path) +# TODO: revisit this, TwoSlopeNorm was causing discoloration +# vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) +# vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) +plot_pendulum_single_critic_predictions( + x=data["critic_obs"][-1], + predictions=data["values"][-1], + targets=data["returns"][-1], + title=f"{critic_name}_iteration{iteration}", + fn=save_path + f"/{critic_name}_it{iteration}", +) From 80e44b1aab0e11bea196810905de967319c6c1f0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 2 May 2024 19:53:45 -0400 Subject: [PATCH 60/69] 
hotfix: don't normalize MC returns --- scripts/offline_critics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 865cca2e..48ac077f 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -34,7 +34,7 @@ test_critic = eval(f"{critic_name}(**test_critic_params).to(DEVICE)") critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) # load data -iteration = 1 +iteration = 200 data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) gamma = 0.99 lam = 0.99 From af9aa2b3bdd524932e035191ad93c5fe178d0acb Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 3 May 2024 18:15:55 -0400 Subject: [PATCH 61/69] bugfixes, and unit test for MC_returns --- gym/envs/base/task_skeleton.py | 1 + learning/runners/datalogging_runner.py | 1 + learning/utils/dict_utils.py | 12 ++- learning/utils/tests/test_returns.py | 122 +++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 learning/utils/tests/test_returns.py diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..038fde5a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py index 06b680cd..0560739a 100644 --- a/learning/runners/datalogging_runner.py +++ b/learning/runners/datalogging_runner.py @@ -56,6 +56,7 @@ def learn(self): # burn in observation normalization. 
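+        # burn-in leaves the envs mid-episode, so reset them (and the episode
+        # counters, see the reset() change in task_skeleton.py) before collecting data.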
if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: self.burn_in_normalization() + self.env.reset() logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index e22dde01..48c20c42 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -5,17 +5,24 @@ @torch.no_grad def compute_MC_returns(data: TensorDict, gamma, critic=None): + # todo not as accurate as taking if critic is None: last_values = torch.zeros_like(data["rewards"][0]) else: last_values = critic.evaluate(data["critic_obs"][-1]) returns = torch.zeros_like(data["rewards"]) - returns[-1] = last_values * ~data["dones"][-1] + returns[-1] = data["rewards"][-1] + gamma * last_values * ~data["terminated"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done - + if critic is not None: + returns[k] += ( + gamma + * critic.evaluate(data["critic_obs"][k]) + * data["timed_out"][k] + * ~data["terminated"][k] + ) return returns @@ -30,7 +37,6 @@ def compute_generalized_advantages(data, gamma, lam, critic): advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this - # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] advantages[-1] = ( data["rewards"][-1] diff --git a/learning/utils/tests/test_returns.py b/learning/utils/tests/test_returns.py new file mode 100644 index 00000000..fab9e3b7 --- /dev/null +++ b/learning/utils/tests/test_returns.py @@ -0,0 +1,122 @@ +import pytest +import torch +from tensordict import TensorDict +from learning.utils import compute_MC_returns + + +class CriticConstant(torch.nn.Module): + def __init__(self, output): + super().__init__() + self.output = output + + def evaluate(self, obs): + return self.output * torch.ones(obs.shape[0]) + + +@pytest.fixture +def setup_data(): + n_timesteps = 4 + n_envs = 5 + rewards = torch.ones((n_timesteps, n_envs)) + terminated = torch.zeros((n_timesteps, n_envs), dtype=torch.bool) + timed_out = torch.zeros_like(terminated) + critic_obs = torch.rand((n_timesteps, n_envs)) + + # Terminating conditions setup + timed_out[-1, [0, 2]] = True + terminated[-1, [1, 2]] = True + timed_out[1, 3] = True + terminated[1, 4] = True + + dones = timed_out | terminated + data = TensorDict( + { + "rewards": rewards, + "timed_out": timed_out, + "terminated": terminated, + "dones": dones, + "critic_obs": critic_obs, + } + ) + + return data + + +def test_critic_always_zero_gamma_zero(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.0, critic=critic) + expected_returns = data["rewards"] + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_four_gamma_zero(setup_data): + data = setup_data + critic = CriticConstant(4) + returns = compute_MC_returns(data, gamma=0.0, critic=critic) + expected_returns = data["rewards"] + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_zero_gamma_one(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=1.0, critic=critic) + expected_returns = torch.tensor( + [ + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [2.0, 1.0, 2.0, 1.0], + [2.0, 1.0, 2.0, 1.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def 
test_critic_always_four_gamma_one(setup_data): + data = setup_data + critic = CriticConstant(4) + returns = compute_MC_returns(data, gamma=1.0, critic=critic) + expected_returns = torch.tensor( + [ + [8.0, 7.0, 6.0, 5.0], + [4.0, 3.0, 2.0, 1.0], + [4.0, 3.0, 2.0, 1.0], + [6.0, 5.0, 6.0, 5.0], + [2.0, 1.0, 6.0, 5.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_zero_gamma_half(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.5, critic=critic) + expected_returns = torch.tensor( + [ + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [1.5, 1.0, 1.5, 1.0], + [1.5, 1.0, 1.5, 1.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) + + +def test_critic_always_four_gamma_half(setup_data): + data = setup_data + critic = CriticConstant(6.0) + returns = compute_MC_returns(data, gamma=0.5, critic=critic) + expected_returns = torch.tensor( + [ + [2.25, 2.5, 3.0, 4.0], + [1.875, 1.75, 1.5, 1.0], + [1.875, 1.75, 1.5, 1.0], + [3.0, 4.0, 3.0, 4.0], + [1.5, 1.0, 3.0, 4.0], + ] + ).T + torch.testing.assert_close(returns, expected_returns) From 2b58fce6c10b60b44bd4f7ce398a8dccca691def Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 3 May 2024 18:57:13 -0400 Subject: [PATCH 62/69] fix shapes, add test for GAE with lam=1.0 --- learning/utils/dict_utils.py | 1 + learning/utils/tests/test_returns.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 48c20c42..d73709fa 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -33,6 +33,7 @@ def normalize(input, eps=1e-8): @torch.no_grad def compute_generalized_advantages(data, gamma, lam, critic): + data["values"] = critic.evaluate(data["critic_obs"]) last_values = critic.evaluate(data["next_critic_obs"][-1]) advantages = torch.zeros_like(data["values"]) if last_values is not None: diff --git a/learning/utils/tests/test_returns.py b/learning/utils/tests/test_returns.py index fab9e3b7..1876c123 100644 --- a/learning/utils/tests/test_returns.py +++ b/learning/utils/tests/test_returns.py @@ -1,7 +1,7 @@ import pytest import torch from tensordict import TensorDict -from learning.utils import compute_MC_returns +from learning.utils import compute_MC_returns, compute_generalized_advantages class CriticConstant(torch.nn.Module): @@ -10,7 +10,7 @@ def __init__(self, output): self.output = output def evaluate(self, obs): - return self.output * torch.ones(obs.shape[0]) + return self.output * torch.ones(obs.shape[:-1]) @pytest.fixture @@ -20,7 +20,7 @@ def setup_data(): rewards = torch.ones((n_timesteps, n_envs)) terminated = torch.zeros((n_timesteps, n_envs), dtype=torch.bool) timed_out = torch.zeros_like(terminated) - critic_obs = torch.rand((n_timesteps, n_envs)) + critic_obs = torch.rand((n_timesteps, n_envs, 1)) # Terminating conditions setup timed_out[-1, [0, 2]] = True @@ -36,7 +36,9 @@ def setup_data(): "terminated": terminated, "dones": dones, "critic_obs": critic_obs, - } + "next_critic_obs": torch.roll(critic_obs, shifts=-1, dims=0), + }, + batch_size=(n_timesteps, n_envs), ) return data @@ -120,3 +122,12 @@ def test_critic_always_four_gamma_half(setup_data): ] ).T torch.testing.assert_close(returns, expected_returns) + + +def test_gae_critic_always_zero_lambda_one(setup_data): + data = setup_data + critic = CriticConstant(0) + returns = compute_MC_returns(data, gamma=0.5, 
critic=critic) + advantages = compute_generalized_advantages(data, gamma=0.5, lam=1.0, critic=critic) + expected_advantages = returns - critic.evaluate(data["critic_obs"]) + torch.testing.assert_close(advantages, expected_advantages) From 7211c30197dc6d235bb2b09bc076a3019f1a7973 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 3 May 2024 19:54:29 -0400 Subject: [PATCH 63/69] log terminated etc. --- learning/runners/datalogging_runner.py | 5 ++++- learning/runners/on_policy_runner.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py index 0560739a..7a227d78 100644 --- a/learning/runners/datalogging_runner.py +++ b/learning/runners/datalogging_runner.py @@ -42,7 +42,9 @@ def learn(self): "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) storage.initialize( @@ -101,6 +103,7 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c1758000..816d03fb 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -42,7 +42,9 @@ def learn(self): "critic_obs": critic_obs, "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], - "dones": self.get_timed_out(), + "timed_out": self.get_timed_out(), + "terminated": self.get_terminated(), + "dones": self.get_timed_out() | self.get_terminated(), } ) storage.initialize( @@ -99,6 +101,7 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, + "terminated": terminated, "dones": dones, } ) From 124ca3ebe41354838e914567f2a77d0e1b27e126 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Fri, 3 May 2024 20:31:40 -0400 Subject: [PATCH 64/69] WIP training multiple critics at once --- learning/modules/lqrc/plotting.py | 6 +- scripts/multiple_offline_critics.py | 88 +++++++++++++++++++++++++++++ scripts/offline_critics.py | 2 +- 3 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 scripts/multiple_offline_critics.py diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 87b685f4..496ae721 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -13,7 +13,11 @@ matplotlib.rc("font", **font) -def plot_pendulum_single_critic_predictions( +def plot_pendulum_multiple_critics(graphing_data): + pass + + +def plot_pendulum_single_critic( x, predictions, targets, title, fn, colorbar_label="f(x)" ): x = x.detach().cpu().numpy().reshape(-1, 2) diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py new file mode 100644 index 00000000..d39eeefb --- /dev/null +++ b/scripts/multiple_offline_critics.py @@ -0,0 +1,88 @@ +import time +from learning.modules.lqrc import Cholesky +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, + create_uniform_generator, +) +from learning.modules.lqrc.plotting import ( + plot_pendulum_single_critic, + plot_pendulum_multiple_critics, +) +from gym import LEGGED_GYM_ROOT_DIR +import os +import torch +from torch 
import nn + +DEVICE = "cuda:0" +# handle some bookkeeping +run_name = "May03_17-11-21_standard_critic" # "May02_08-59-46_standard_critic" +log_dir = os.path.join( + LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name +) +time_str = time.strftime("%Y%m%d_%H%M%S") +save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str) +if not os.path.exists(save_path): + os.makedirs(save_path) +critics = ["Cholesky"] # list of strings of the class names of critics to test +graphing_data = {critic: {} for critic in critics} +# create fresh critic +test_critic_params = { + "num_obs": 2, + "hidden_dims": None, + "activation": "elu", + "normalize_obs": False, + "output_size": 1, + "device": DEVICE, +} +learning_rate = 1.0e-4 +tot_iter = 200 +for iteration in range(tot_iter): + # load data + data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) + gamma = 0.99 + lam = 0.99 + + # compute ground-truth + episode_rollouts = compute_MC_returns(data, gamma) + for critic in critics: + test_critic = eval(f"{critic}(**test_critic_params).to(DEVICE)") + critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) + # train new critic + data["values"] = test_critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, gamma, lam, test_critic + ) + data["returns"] = data["advantages"] + data["values"] + + mean_value_loss = 0 + counter = 0 + max_gradient_steps = 24 + max_grad_norm = 1.0 + batch_size = 2**16 + generator = create_uniform_generator( + data, + batch_size, + max_gradient_steps=max_gradient_steps, + ) + for batch in generator: + value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"]) + critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm) + critic_optimizer.step() + mean_value_loss += value_loss.item() + print("Value loss: ", value_loss.item()) + counter += 1 + mean_value_loss /= counter + graphing_data[critic]["critic_obs"] = data["critic_obs"][-1] + graphing_data[critic]["values"] = data["values"][-1] + graphing_data[critic]["returns"] = data["returns"][-1] + + # compare new and old critics + # TODO: revisit this, TwoSlopeNorm was causing discoloration + # vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) + # vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) + plot_pendulum_multiple_critics( + graphing_data, + ) # TODO: WIP diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 48ac077f..0df07088 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -5,7 +5,7 @@ compute_MC_returns, create_uniform_generator, ) -from learning.modules.lqrc.plotting import plot_pendulum_single_critic_predictions +from learning.modules.lqrc.plotting import plot_pendulum_single_critic from gym import LEGGED_GYM_ROOT_DIR import os import torch From 6320c606b79f6e53fb79e18fd0af03d1600c36f2 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 3 May 2024 20:58:19 -0400 Subject: [PATCH 65/69] make time-outs also terminations for pendulum --- gym/envs/pendulum/pendulum.py | 4 ++++ scripts/multiple_offline_critics.py | 8 +++----- scripts/offline_critics.py | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 317c7ed0..c63ffa6c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -9,6 +9,10 @@ def 
_post_physics_step(self): """Update all states that are not handled in PhysX""" super()._post_physics_step() + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + def reset_to_uniform(self, env_ids): grid_points = int(sqrt(self.num_envs)) lin_pos = torch.linspace( diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py index d39eeefb..cc0cc244 100644 --- a/scripts/multiple_offline_critics.py +++ b/scripts/multiple_offline_critics.py @@ -1,14 +1,12 @@ import time -from learning.modules.lqrc import Cholesky +from learning.modules.lqrc import Cholesky # noqa F401 from learning.utils import ( compute_generalized_advantages, compute_MC_returns, create_uniform_generator, ) -from learning.modules.lqrc.plotting import ( - plot_pendulum_single_critic, - plot_pendulum_multiple_critics, -) +from learning.modules.lqrc.plotting import plot_pendulum_multiple_critics + from gym import LEGGED_GYM_ROOT_DIR import os import torch diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 0df07088..29f17f8c 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -1,5 +1,5 @@ import time -from learning.modules.lqrc import Cholesky +from learning.modules.lqrc import Cholesky # noqa F401 from learning.utils import ( compute_generalized_advantages, compute_MC_returns, @@ -14,7 +14,7 @@ DEVICE = "cuda:0" # handle some bookkeeping # -run_name = "May03_17-11-21_standard_critic" # "May02_08-59-46_standard_critic" +run_name = "May03_20-49-23_standard_critic" # "May02_08-59-46_standard_critic" log_dir = os.path.join( LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) @@ -30,7 +30,7 @@ "device": DEVICE, } learning_rate = 1.0e-4 -critic_name = "Cholesky" +critic_name = " # noqa F401" test_critic = eval(f"{critic_name}(**test_critic_params).to(DEVICE)") critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) # load data @@ -78,7 +78,7 @@ # TODO: revisit this, TwoSlopeNorm was causing discoloration # vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) # vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) -plot_pendulum_single_critic_predictions( +plot_pendulum_single_critic( x=data["critic_obs"][-1], predictions=data["values"][-1], targets=data["returns"][-1], From 9882e42d7cfd8bae099f47bcc3fdea796778b801 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Sun, 12 May 2024 13:46:07 -0400 Subject: [PATCH 66/69] all iterations single critic --- scripts/offline_critics.py | 94 +++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 29f17f8c..8a6bee10 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -13,14 +13,13 @@ DEVICE = "cuda:0" # handle some bookkeeping -# -run_name = "May03_20-49-23_standard_critic" # "May02_08-59-46_standard_critic" +run_name = "May12_13-27-05_standard_critic" # "May03_20-49-23_standard_critic" "May02_08-59-46_standard_critic" log_dir = os.path.join( LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) +time_str = time.strftime("%Y%m%d_%H%M%S") # create fresh critic - test_critic_params = { "num_obs": 2, "hidden_dims": None, @@ -30,58 +29,57 @@ "device": DEVICE, } learning_rate = 1.0e-4 -critic_name = " # noqa F401" +critic_name = "Cholesky" test_critic = 
eval(f"{critic_name}(**test_critic_params).to(DEVICE)") critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) -# load data -iteration = 200 -data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) gamma = 0.99 lam = 0.99 +tot_iter = 200 -# compute ground-truth +for iteration in range(tot_iter): + # load data + data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) -episode_rollouts = compute_MC_returns(data, gamma) + # compute ground-truth + episode_rollouts = compute_MC_returns(data, gamma) -# train new critic + # train new critic + data["values"] = test_critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, test_critic) + data["returns"] = data["advantages"] + data["values"] -data["values"] = test_critic.evaluate(data["critic_obs"]) -data["advantages"] = compute_generalized_advantages(data, gamma, lam, test_critic) -data["returns"] = data["advantages"] + data["values"] + mean_value_loss = 0 + counter = 0 + max_gradient_steps = 24 + max_grad_norm = 1.0 + batch_size = 2**16 + generator = create_uniform_generator( + data, + batch_size, + max_gradient_steps=max_gradient_steps, + ) + for batch in generator: + value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"]) + critic_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm) + critic_optimizer.step() + mean_value_loss += value_loss.item() + print("Value loss: ", value_loss.item()) + counter += 1 + mean_value_loss /= counter -mean_value_loss = 0 -counter = 0 -max_gradient_steps = 24 -max_grad_norm = 1.0 -batch_size = 2**16 -generator = create_uniform_generator( - data, - batch_size, - max_gradient_steps=max_gradient_steps, -) -for batch in generator: - value_loss = test_critic.loss_fn(batch["critic_obs"], batch["returns"]) - critic_optimizer.zero_grad() - value_loss.backward() - nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm) - critic_optimizer.step() - mean_value_loss += value_loss.item() - print("Value loss: ", value_loss.item()) - counter += 1 -mean_value_loss /= counter + # compare new and old critics + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str + ) + if not os.path.exists(save_path): + os.makedirs(save_path) -# compare new and old critics -time_str = time.strftime("%Y%m%d_%H%M%S") -save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str) -if not os.path.exists(save_path): - os.makedirs(save_path) -# TODO: revisit this, TwoSlopeNorm was causing discoloration -# vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) -# vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) -plot_pendulum_single_critic( - x=data["critic_obs"][-1], - predictions=data["values"][-1], - targets=data["returns"][-1], - title=f"{critic_name}_iteration{iteration}", - fn=save_path + f"/{critic_name}_it{iteration}", -) + plot_pendulum_single_critic( + x=data["critic_obs"][-1], + predictions=data["values"][-1], + targets=data["returns"][-1], + title=f"{critic_name}_iteration{iteration}", + fn=save_path + f"/{critic_name}_it{iteration}", + ) From 2edb4715dbe2f2be14ad5d21832f8b49dbef6a4d Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Sun, 12 May 2024 15:56:27 -0400 Subject: [PATCH 67/69] multiple critic graphing done --- learning/modules/lqrc/__init__.py | 2 +- 
learning/modules/lqrc/custom_critics.py | 2 +- learning/modules/lqrc/plotting.py | 47 +++++++++++++++++- scripts/lqrc/save_video.py | 33 +++++++++++++ scripts/multiple_offline_critics.py | 63 ++++++++++++++++--------- 5 files changed, 121 insertions(+), 26 deletions(-) create mode 100644 scripts/lqrc/save_video.py diff --git a/learning/modules/lqrc/__init__.py b/learning/modules/lqrc/__init__.py index f7050b41..56cea2c0 100644 --- a/learning/modules/lqrc/__init__.py +++ b/learning/modules/lqrc/__init__.py @@ -1,4 +1,4 @@ -from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss +from .custom_nn import CholeskyPlusConst, QuadraticNetCholesky, BaselineMLP, CustomCholeskyLoss, CustomCholeskyPlusConstLoss # deprecated but left here until there's time for clean up from .utils import benchmark_args from .custom_critics import CustomCriticBaseline, Cholesky, CholeskyPlusConst, CholeskyOffset1, CholeskyOffset2 from .autoencoder import Autoencoder \ No newline at end of file diff --git a/learning/modules/lqrc/custom_critics.py b/learning/modules/lqrc/custom_critics.py index 1d1e267d..764980d6 100644 --- a/learning/modules/lqrc/custom_critics.py +++ b/learning/modules/lqrc/custom_critics.py @@ -156,7 +156,7 @@ def hook(module, input, output): return hook def loss_fn(self, input, target): - loss = F.mse_loss(input, target, reduction="mean") + loss = F.mse_loss(self.evaluate(input), target, reduction="mean") const_loss = self.const_penalty * torch.mean(self.intermediates["c"]) return loss + const_loss diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 496ae721..1171101c 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -13,8 +13,51 @@ matplotlib.rc("font", **font) -def plot_pendulum_multiple_critics(graphing_data): - pass +def plot_pendulum_multiple_critics( + x, predictions, targets, title, fn, colorbar_label="f(x)" +): + num_critics = len(x.keys()) + assert ( + num_critics >= 1 + ), "This function requires at least two critics for graphing. To graph a single critic please use its corresponding graphing function." 
+ fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(6 * num_critics, 10)) + error = {} + for ix, critic_name in enumerate(x): + np_x = x[critic_name].detach().cpu().numpy().reshape(-1, 2) + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = targets[critic_name].detach().cpu().numpy().reshape(-1) + np_error = np_predictions - np_targets + + # predictions + pred_scatter = axes[0, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_predictions, + cmap="PiYG", + alpha=0.5, + norm=CenteredNorm(), + ) + axes[0, ix].set_title(f"{critic_name} Prediction") + + # error + error_scatter = axes[1, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_error, + cmap="bwr", + alpha=0.5, + norm=CenteredNorm(), + ) + axes[1, ix].set_title(f"{critic_name} Error") + fig.colorbar( + pred_scatter, ax=axes[0, :].ravel().tolist(), shrink=0.95, label=colorbar_label + ) + fig.colorbar( + error_scatter, ax=axes[1, :].ravel().tolist(), shrink=0.95, label=colorbar_label + ) + fig.suptitle(title, fontsize=16) + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") def plot_pendulum_single_critic( diff --git a/scripts/lqrc/save_video.py b/scripts/lqrc/save_video.py new file mode 100644 index 00000000..8cbeb8d4 --- /dev/null +++ b/scripts/lqrc/save_video.py @@ -0,0 +1,33 @@ +import cv2 +import os +from gym import LEGGED_GYM_ROOT_DIR +import re + + +def extract_iteration(filename): + # Use regular expression to find the numbers following 'it' + match = re.search(r"it(\d+)", filename) + if match: + return int(match.group(1)) + return 0 # Return 0 or some default value if 'it' followed by digits is not found + + +image_folder = ( + os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph") + + "/20240512_153648" +) +video_name = "video.avi" +img_names = os.listdir(image_folder) +# Sort the list using the custom key function +img_names = sorted(img_names, key=extract_iteration) +images = [img for img in img_names if img.endswith(".png")] +frame = cv2.imread(os.path.join(image_folder, images[0])) +height, width, layers = frame.shape + +video = cv2.VideoWriter(video_name, 0, 10, (width, height)) + +for image in images: + video.write(cv2.imread(os.path.join(image_folder, image))) + +cv2.destroyAllWindows() +video.release() diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py index cc0cc244..27db8558 100644 --- a/scripts/multiple_offline_critics.py +++ b/scripts/multiple_offline_critics.py @@ -1,12 +1,17 @@ import time -from learning.modules.lqrc import Cholesky # noqa F401 +from learning.modules.lqrc import ( + CustomCriticBaseline, + Cholesky, + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2, +) # noqa F401 from learning.utils import ( compute_generalized_advantages, compute_MC_returns, create_uniform_generator, ) from learning.modules.lqrc.plotting import plot_pendulum_multiple_critics - from gym import LEGGED_GYM_ROOT_DIR import os import torch @@ -14,16 +19,12 @@ DEVICE = "cuda:0" # handle some bookkeeping -run_name = "May03_17-11-21_standard_critic" # "May02_08-59-46_standard_critic" +run_name = "May12_13-27-05_standard_critic" # "May03_20-49-23_standard_critic" "May02_08-59-46_standard_critic" log_dir = os.path.join( LEGGED_GYM_ROOT_DIR, "logs", "pendulum_standard_critic", run_name ) time_str = time.strftime("%Y%m%d_%H%M%S") -save_path = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str) -if not os.path.exists(save_path): - os.makedirs(save_path) -critics = ["Cholesky"] # list of strings of the class 
names of critics to test -graphing_data = {critic: {} for critic in critics} + # create fresh critic test_critic_params = { "num_obs": 2, @@ -34,18 +35,28 @@ "device": DEVICE, } learning_rate = 1.0e-4 +critic_names = ["Cholesky", "CholeskyPlusConst", "CholeskyOffset1", "CholeskyOffset2"] +test_critics = { + name: eval(f"{name}(**test_critic_params).to(DEVICE)") for name in critic_names +} +critic_optimizers = { + name: torch.optim.Adam(critic.parameters(), lr=learning_rate) + for name, critic in test_critics.items() +} +gamma = 0.99 +lam = 0.99 tot_iter = 200 + for iteration in range(tot_iter): # load data data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) - gamma = 0.99 - lam = 0.99 # compute ground-truth episode_rollouts = compute_MC_returns(data, gamma) - for critic in critics: - test_critic = eval(f"{critic}(**test_critic_params).to(DEVICE)") - critic_optimizer = torch.optim.Adam(test_critic.parameters(), lr=learning_rate) + + graphing_data = {data_name: {} for data_name in ["critic_obs", "values", "returns"]} + for name, test_critic in test_critics.items(): + critic_optimizer = critic_optimizers[name] # train new critic data["values"] = test_critic.evaluate(data["critic_obs"]) data["advantages"] = compute_generalized_advantages( @@ -70,17 +81,25 @@ nn.utils.clip_grad_norm_(test_critic.parameters(), max_grad_norm) critic_optimizer.step() mean_value_loss += value_loss.item() - print("Value loss: ", value_loss.item()) + print(f"{name} value loss: ", value_loss.item()) counter += 1 mean_value_loss /= counter - graphing_data[critic]["critic_obs"] = data["critic_obs"][-1] - graphing_data[critic]["values"] = data["values"][-1] - graphing_data[critic]["returns"] = data["returns"][-1] + + graphing_data["critic_obs"][name] = data[-1, :]["critic_obs"] + graphing_data["values"][name] = data[-1, :]["values"] + graphing_data["returns"][name] = data[-1, :]["returns"] # compare new and old critics - # TODO: revisit this, TwoSlopeNorm was causing discoloration - # vmin = min(torch.min(data["returns"]).item(), torch.min(data["values"]).item()) - # vmax = max(torch.max(data["returns"]).item(), torch.max(data["values"]).item()) + save_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph", time_str + ) + if not os.path.exists(save_path): + os.makedirs(save_path) + plot_pendulum_multiple_critics( - graphing_data, - ) # TODO: WIP + graphing_data["critic_obs"], + graphing_data["values"], + graphing_data["returns"], + title=f"iteration{iteration}", + fn=save_path + f"/{len(critic_names)}_critics_it{iteration}", + ) From 1c462758235150c8be4c7590fd34fa106b316677 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Sun, 12 May 2024 21:08:44 -0400 Subject: [PATCH 68/69] added ground truth + vanilla critic and fixed indexing --- learning/modules/lqrc/plotting.py | 37 +++++++++++++------------ scripts/lqrc/save_video.py | 4 +-- scripts/multiple_offline_critics.py | 43 +++++++++++++++++++++++------ scripts/offline_critics.py | 6 ++-- 4 files changed, 59 insertions(+), 31 deletions(-) diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 1171101c..86dd04eb 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -2,7 +2,7 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np -from matplotlib.colors import CenteredNorm +from matplotlib.colors import CenteredNorm, TwoSlopeNorm matplotlib.rcParams["pdf.fonttype"] = 42 
matplotlib.rcParams["ps.fonttype"] = 42 @@ -29,26 +29,29 @@ def plot_pendulum_multiple_critics( np_error = np_predictions - np_targets # predictions - pred_scatter = axes[0, ix].scatter( - np_x[:, 0], - np_x[:, 1], - c=np_predictions, - cmap="PiYG", - alpha=0.5, - norm=CenteredNorm(), - ) + if np.any(~np.equal(np_predictions, np.zeros_like(np_predictions))): + pred_scatter = axes[0, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_predictions, + cmap="PiYG", + alpha=0.5, + norm=CenteredNorm(), + ) axes[0, ix].set_title(f"{critic_name} Prediction") # error - error_scatter = axes[1, ix].scatter( - np_x[:, 0], - np_x[:, 1], - c=np_error, - cmap="bwr", - alpha=0.5, - norm=CenteredNorm(), - ) + if np.any(~np.equal(np_error, np.zeros_like(np_error))): + error_scatter = axes[1, ix].scatter( + np_x[:, 0], + np_x[:, 1], + c=np_error, + cmap="bwr", + alpha=0.5, + norm=CenteredNorm(), + ) axes[1, ix].set_title(f"{critic_name} Error") + fig.colorbar( pred_scatter, ax=axes[0, :].ravel().tolist(), shrink=0.95, label=colorbar_label ) diff --git a/scripts/lqrc/save_video.py b/scripts/lqrc/save_video.py index 8cbeb8d4..5a98e86f 100644 --- a/scripts/lqrc/save_video.py +++ b/scripts/lqrc/save_video.py @@ -14,9 +14,9 @@ def extract_iteration(filename): image_folder = ( os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph") - + "/20240512_153648" + + "/20240512_205525" ) -video_name = "video.avi" +video_name = f"{image_folder}/video.avi" img_names = os.listdir(image_folder) # Sort the list using the custom key function img_names = sorted(img_names, key=extract_iteration) diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py index 27db8558..0465a7dc 100644 --- a/scripts/multiple_offline_critics.py +++ b/scripts/multiple_offline_critics.py @@ -1,4 +1,5 @@ import time +from learning.modules.critic import Critic from learning.modules.lqrc import ( CustomCriticBaseline, Cholesky, @@ -11,7 +12,9 @@ compute_MC_returns, create_uniform_generator, ) -from learning.modules.lqrc.plotting import plot_pendulum_multiple_critics +from learning.modules.lqrc.plotting import ( + plot_pendulum_multiple_critics, +) from gym import LEGGED_GYM_ROOT_DIR import os import torch @@ -26,7 +29,7 @@ time_str = time.strftime("%Y%m%d_%H%M%S") # create fresh critic -test_critic_params = { +custom_critic_params = { "num_obs": 2, "hidden_dims": None, "activation": "elu", @@ -34,17 +37,35 @@ "output_size": 1, "device": DEVICE, } +vanilla_critic_params = { + "num_obs": 2, + "hidden_dims": [128, 64, 32], + "activation": "elu", + "normalize_obs": False, + "output_size": 1, + "device": DEVICE, +} learning_rate = 1.0e-4 -critic_names = ["Cholesky", "CholeskyPlusConst", "CholeskyOffset1", "CholeskyOffset2"] +critic_names = [ + "Critic", + "Cholesky", + "CholeskyPlusConst", + "CholeskyOffset1", + "CholeskyOffset2", +] test_critics = { - name: eval(f"{name}(**test_critic_params).to(DEVICE)") for name in critic_names + name: eval(f"{name}(**custom_critic_params).to(DEVICE)") + if not name == "Critic" + else eval(f"{name}(**vanilla_critic_params).to(DEVICE)") + for name in critic_names } + critic_optimizers = { name: torch.optim.Adam(critic.parameters(), lr=learning_rate) for name, critic in test_critics.items() } gamma = 0.99 -lam = 0.99 +lam = 1.0 tot_iter = 200 for iteration in range(tot_iter): @@ -52,9 +73,13 @@ data = torch.load(os.path.join(log_dir, "data_{}.pt".format(iteration))).to(DEVICE) # compute ground-truth + graphing_data = {data_name: {} for data_name in ["critic_obs", "values", "returns"]} + 
episode_rollouts = compute_MC_returns(data, gamma) + graphing_data["critic_obs"]["Ground Truth MC Returns"] = data[0, :]["critic_obs"] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] - graphing_data = {data_name: {} for data_name in ["critic_obs", "values", "returns"]} for name, test_critic in test_critics.items(): critic_optimizer = critic_optimizers[name] # train new critic @@ -85,9 +110,9 @@ counter += 1 mean_value_loss /= counter - graphing_data["critic_obs"][name] = data[-1, :]["critic_obs"] - graphing_data["values"][name] = data[-1, :]["values"] - graphing_data["returns"][name] = data[-1, :]["returns"] + graphing_data["critic_obs"][name] = data[0, :]["critic_obs"] + graphing_data["values"][name] = data[0, :]["values"] + graphing_data["returns"][name] = data[0, :]["returns"] # compare new and old critics save_path = os.path.join( diff --git a/scripts/offline_critics.py b/scripts/offline_critics.py index 8a6bee10..1a2a3e7a 100644 --- a/scripts/offline_critics.py +++ b/scripts/offline_critics.py @@ -77,9 +77,9 @@ os.makedirs(save_path) plot_pendulum_single_critic( - x=data["critic_obs"][-1], - predictions=data["values"][-1], - targets=data["returns"][-1], + x=data["critic_obs"][0], + predictions=data["values"][0], + targets=data["returns"][0], title=f"{critic_name}_iteration{iteration}", fn=save_path + f"/{critic_name}_it{iteration}", ) From 643f55092ad16ea4c7dff4380b58d7fa89657408 Mon Sep 17 00:00:00 2001 From: Julia Schneider <54411881+jschneider03@users.noreply.github.com> Date: Wed, 15 May 2024 16:09:31 -0400 Subject: [PATCH 69/69] quick fixes + clean up --- gym/envs/pendulum/pendulum_config.py | 2 +- learning/modules/lqrc/plotting.py | 1 - learning/runners/datalogging_runner.py | 19 +++++++++++++++++++ scripts/lqrc/save_video.py | 8 +++++--- scripts/multiple_offline_critics.py | 2 +- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index a45f21cc..ba7f42c7 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -79,7 +79,7 @@ class noise: dof_vel = 0.0 class critic: - critic_class_name = "" + critic_class_name = "CholeskyOffset1" normalize_obs = True obs = [ "dof_pos", diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py index 86dd04eb..9dde12df 100644 --- a/learning/modules/lqrc/plotting.py +++ b/learning/modules/lqrc/plotting.py @@ -21,7 +21,6 @@ def plot_pendulum_multiple_critics( num_critics >= 1 ), "This function requires at least two critics for graphing. To graph a single critic please use its corresponding graphing function." 
fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(6 * num_critics, 10)) - error = {} for ix, critic_name in enumerate(x): np_x = x[critic_name].detach().cpu().numpy().reshape(-1, 2) np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) diff --git a/learning/runners/datalogging_runner.py b/learning/runners/datalogging_runner.py index 7a227d78..d7847362 100644 --- a/learning/runners/datalogging_runner.py +++ b/learning/runners/datalogging_runner.py @@ -5,6 +5,15 @@ from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, Critic # noqa: F401 +from learning.modules.lqrc import ( + CustomCriticBaseline, + Cholesky, + CholeskyPlusConst, + CholeskyOffset1, + CholeskyOffset2, +) # noqa F401 from learning.storage import DictStorage from learning.utils import export_to_numpy @@ -22,6 +31,16 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) + def _set_up_alg(self): + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + critic_class_name = self.critic_cfg["critic_class_name"] + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = eval(f"{critic_class_name}(num_critic_obs, **self.critic_cfg)") + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + def learn(self): self.set_up_logger() diff --git a/scripts/lqrc/save_video.py b/scripts/lqrc/save_video.py index 5a98e86f..755f687d 100644 --- a/scripts/lqrc/save_video.py +++ b/scripts/lqrc/save_video.py @@ -14,13 +14,15 @@ def extract_iteration(filename): image_folder = ( os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "offline_critics_graph") - + "/20240512_205525" + + "/20240512_223831" ) -video_name = f"{image_folder}/video.avi" +video_name = f"{image_folder}/video_100.avi" img_names = os.listdir(image_folder) # Sort the list using the custom key function img_names = sorted(img_names, key=extract_iteration) -images = [img for img in img_names if img.endswith(".png")] +images = [ + img for img in img_names if img.endswith(".png") and extract_iteration(img) > 60 +] frame = cv2.imread(os.path.join(image_folder, images[0])) height, width, layers = frame.shape diff --git a/scripts/multiple_offline_critics.py b/scripts/multiple_offline_critics.py index 0465a7dc..f190f983 100644 --- a/scripts/multiple_offline_critics.py +++ b/scripts/multiple_offline_critics.py @@ -125,6 +125,6 @@ graphing_data["critic_obs"], graphing_data["values"], graphing_data["returns"], - title=f"iteration{iteration}", + title=f"Neural Network Training Iteration {iteration}", fn=save_path + f"/{len(critic_names)}_critics_it{iteration}", )
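
A note on the targets the offline-critic scripts above regress on: each test critic is fit to returns = advantages + values, with advantages coming from generalized advantage estimation, while compute_MC_returns supplies the Monte Carlo ground truth used for the comparison plots. The sketch below is a minimal illustration of the standard GAE recursion under assumed [T, N]-shaped reward/value/done tensors; it is not the in-repo compute_generalized_advantages helper, whose signature and details may differ. With lam close to 1 and time-outs treated as terminations (as of PATCH 65), these targets approach the Monte Carlo returns.

import torch


def gae_advantages_and_returns(rewards, values, dones, last_values,
                               gamma=0.99, lam=0.99):
    """Standard GAE recursion over a rollout of T steps and N environments.

    rewards, values, dones: tensors shaped [T, N]; last_values: [N] bootstrap
    values for the state after the final step. Illustrative sketch only.
    """
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(last_values)
    for t in reversed(range(T)):
        next_values = last_values if t == T - 1 else values[t + 1]
        not_done = 1.0 - dones[t].float()
        # TD residual; the successor value is masked out at terminations.
        delta = rewards[t] + gamma * next_values * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
    # Critic regression targets, mirroring data["returns"] = data["advantages"] + data["values"] in the scripts.
    returns = advantages + values
    return advantages, returns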