From bb85463c8c8cbd981673262a396f615ee38ba3fa Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 28 Feb 2024 13:30:12 -0500 Subject: [PATCH 01/42] smooth noise sampling and started gSDE --- .gitignore | 2 + .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo.py | 2 +- learning/modules/actor.py | 19 ++ learning/modules/actor_critic.py | 15 ++ learning/modules/utils/__init__.py | 1 + learning/modules/utils/gSDE.py | 191 ++++++++++++++++++ learning/modules/utils/neural_net.py | 9 +- learning/runners/on_policy_runner.py | 4 + plots/plot.py | 19 ++ 10 files changed, 260 insertions(+), 4 deletions(-) create mode 100644 learning/modules/utils/gSDE.py create mode 100644 plots/plot.py diff --git a/.gitignore b/.gitignore index 0b68375a..d2761927 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +plots/*.csv +plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..4db376ce 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" class policy(MiniCheetahRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..3ca5a5b4 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -152,7 +152,7 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..e588ea5e 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -37,6 +37,9 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False + # smooth exploration + self.use_smooth_expl = True + self.episode_noise = None @property def action_mean(self): @@ -58,8 +61,24 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) + if self.use_smooth_expl: + return self.act_smooth() return self.distribution.sample() + def act_smooth(self): + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + # write to csv + with open( + "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" + ) as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..91365fcc 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,6 +2,7 @@ from .actor import Actor from .critic import Critic +from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -15,6 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, + use_sde=True, **kwargs, ): if kwargs: @@ -40,6 
+42,19 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") + # TODO[lm]: Decide how to handle the state dependent noise distribution in + # this class. In stable-baselines3 there is a class MlpExtractor which does + # what the Actor and Critic classes do here, just with the latent representation + # of the networks. Either I make a new class in a similar way and store the + # action distribution here, or I make the changes in Actor and Critic and change + # the distribution there. + if use_sde: + self.action_dist = StateDependentNoiseDistribution( + num_actions, + num_actor_obs, + num_critic_obs, + ) + @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..3bc82600 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py new file mode 100644 index 00000000..2920b796 --- /dev/null +++ b/learning/modules/utils/gSDE.py @@ -0,0 +1,191 @@ +from typing import Tuple, Optional + +import torch +import torch.nn as nn +from torch.distributions import Normal + + +# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +# TODO[lm]: Need to update some of the naming conventions to fit our codebase. +class StateDependentNoiseDistribution: + """ + Distribution class for using generalized State Dependent Exploration (gSDE). + It is used to create the noise exploration matrix and compute the log + probability of an action with that noise. + """ + + latent_sde_dim: Optional[int] + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + num_actions: int, + num_actor_obs: int, + num_critic_obs: int, + full_std: bool = True, + use_expln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + ): + self.num_actions = num_actions + self.num_actor_obs = num_actor_obs + self.num_critic_obs = num_critic_obs + self.full_std = full_std + self.use_expln = use_expln + self.learn_features = learn_features + self.epsilon = epsilon + self.mean_actions = None + self.log_std = None + + def get_std(self, log_std: torch.Tensor) -> torch.Tensor: + """ + Get the standard deviation from the learned parameter + (log of it by default). This ensures that the std is positive. 
+ + :param log_std: + :return: + """ + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(log_std) * (log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = log_std * (log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(log_std) + + if self.full_std: + return std + assert self.latent_sde_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std + + def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: + """ + Sample weights for the noise exploration matrix, + using a centered Gaussian distribution. + + :param log_std: + :param batch_size: + """ + std = self.get_std(log_std) + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def proba_distribution_net( + self, + latent_dim: int, + log_std_init: float = -2.0, + latent_sde_dim: Optional[int] = None, + ) -> Tuple[nn.Module, nn.Parameter]: + """ + Create the layers and parameter that represent the distribution: + one output will be the deterministic action, the other parameter will be the + standard deviation of the distribution that control the weights of the noise + matrix. + + :param latent_dim: Dimension of the last layer of the policy (before the + action layer) + :param log_std_init: Initial value for the log standard deviation + :param latent_sde_dim: Dimension of the last layer of the features extractor + for gSDE. By default, it is shared with the policy network. 
+ :return: + """ + # Network for the deterministic action, it represents the mean of the + # distribution + mean_actions_net = nn.Linear(latent_dim, self.action_dim) + # When we learn features for the noise, the feature dimension + # can be different between the policy and the noise network + self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim + # Reduce the number of parameters if needed + log_std = ( + torch.ones(self.latent_sde_dim, self.action_dim) + if self.full_std + else torch.ones(self.latent_sde_dim, 1) + ) + # Transform it to a parameter so it can be optimized + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights(log_std) + return mean_actions_net, log_std + + def proba_distribution( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> "StateDependentNoiseDistribution": + """ + Create the distribution given its parameters (mean, std) + + :param mean_actions: + :param log_std: + :param latent_sde: + :return: + """ + # Stop gradient if we don't want to influence the features + self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + return self + + def log_prob(self, actions: torch.Tensor): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.log_prob(actions).sum(-1) + + def entropy(self): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.entropy().sum(-1) + + def sample(self) -> torch.Tensor: + noise = self.get_noise(self._latent_sde) + actions = self.distribution.mean + noise + return actions + + def mode(self) -> torch.Tensor: + return self.distribution.mean + + def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: + latent_sde = latent_sde if self.learn_features else latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) + + def actions_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + deterministic: bool = False, + ) -> torch.Tensor: + # Update the proba distribution + self.proba_distribution(mean_actions, log_std, latent_sde) + if deterministic: + return self.mode() + return self.sample() + + def log_prob_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + actions = self.actions_from_params(mean_actions, log_std, latent_sde) + log_prob = self.log_prob(actions) + return actions, log_prob diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..fa25d77b 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -3,7 +3,9 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, 
latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,7 +17,10 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: + # TODO[lm]: Could also create a separate function that gives the latent + # representation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) + if i == len(hidden_dims) - 1 and not latent: add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e2fb524e..9bf7c9f7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,6 +37,10 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # Reset noise for smooth exploration + if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + self.alg.actor_critic.actor.episode_noise = None + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py new file mode 100644 index 00000000..2a33eafa --- /dev/null +++ b/plots/plot.py @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt + +# Read the CSV file +name = "distribution_smooth" +data = pd.read_csv(name + ".csv") + +# Plot the data +n = 200 +plt.plot(data.iloc[:n, 0]) +plt.plot(data.iloc[:n, 1]) +plt.xlabel("timestep") +plt.ylabel("action") +plt.title("Smoothing every rollout") +plt.legend(["mean", "sample"]) +# plt.show() + +# Save the plot to a file +plt.savefig(name + ".png") From 243cf0f787b268d4b6f75e35b6524e3de911108b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 10:41:25 -0500 Subject: [PATCH 02/42] address comments --- gym/envs/base/legged_robot_config.py | 1 + gym/envs/mini_cheetah/mini_cheetah_config.py | 1 + .../mini_cheetah/mini_cheetah_osc_config.py | 1 + .../mini_cheetah/mini_cheetah_ref_config.py | 1 + gym/envs/mit_humanoid/mit_humanoid_config.py | 1 + learning/modules/actor.py | 44 +++++++++++-------- learning/modules/actor_critic.py | 32 +++++++++----- learning/modules/utils/gSDE.py | 29 +++--------- learning/runners/on_policy_runner.py | 2 +- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..144f8323 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -240,6 +240,7 @@ class policy: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False actor_obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ef9896e0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,6 +130,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_lin_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..45191a9f 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class 
policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4db376ce..2f894bde 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..25db6163 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_height", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index e588ea5e..857d7326 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -37,9 +39,6 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False - # smooth exploration - self.use_smooth_expl = True - self.episode_noise = None @property def action_mean(self): @@ -61,24 +60,8 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if self.use_smooth_expl: - return self.act_smooth() return self.distribution.sample() - def act_smooth(self): - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - # write to csv - with open( - "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" - ) as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample - def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -97,3 +80,26 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + +class SmoothActor(Actor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Noise for smooth exploration + self.episode_noise = None + self.debug = False + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 91365fcc..669a9f6a 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,6 @@ import torch.nn as nn -from .actor import 
Actor +from .actor import Actor, SmoothActor from .critic import Critic from .utils import StateDependentNoiseDistribution @@ -16,7 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, - use_sde=True, + smooth_exploration=False, **kwargs, ): if kwargs: @@ -26,14 +26,24 @@ def __init__( ) super(ActorCritic, self).__init__() - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) + if smooth_exploration: + self.actor = SmoothActor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) + else: + self.actor = Actor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) self.critic = Critic( num_critic_obs, critic_hidden_dims, activation, normalize_obs @@ -48,7 +58,7 @@ def __init__( # of the networks. Either I make a new class in a similar way and store the # action distribution here, or I make the changes in Actor and Critic and change # the distribution there. - if use_sde: + if smooth_exploration: self.action_dist = StateDependentNoiseDistribution( num_actions, num_actor_obs, diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 2920b796..6e04b0e9 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -5,7 +5,8 @@ from torch.distributions import Normal -# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# The following implementation was used in the gSDE paper on smooth exploration. +# See code: # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 # TODO[lm]: Need to update some of the naming conventions to fit our codebase. class StateDependentNoiseDistribution: @@ -45,9 +46,6 @@ def get_std(self, log_std: torch.Tensor) -> torch.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. - - :param log_std: - :return: """ if self.use_expln: # From gSDE paper, it allows to keep variance @@ -71,9 +69,6 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. - - :param log_std: - :param batch_size: """ std = self.get_std(log_std) self.weights_dist = Normal(torch.zeros_like(std), std) @@ -82,7 +77,7 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net( + def get_distribution_net( self, latent_dim: int, log_std_init: float = -2.0, @@ -93,13 +88,6 @@ def proba_distribution_net( one output will be the deterministic action, the other parameter will be the standard deviation of the distribution that control the weights of the noise matrix. - - :param latent_dim: Dimension of the last layer of the policy (before the - action layer) - :param log_std_init: Initial value for the log standard deviation - :param latent_sde_dim: Dimension of the last layer of the features extractor - for gSDE. By default, it is shared with the policy network. 
- :return: """ # Network for the deterministic action, it represents the mean of the # distribution @@ -119,7 +107,7 @@ def proba_distribution_net( self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution( + def get_distribution( self, mean_actions: torch.Tensor, log_std: torch.Tensor, @@ -127,11 +115,6 @@ def proba_distribution( ) -> "StateDependentNoiseDistribution": """ Create the distribution given its parameters (mean, std) - - :param mean_actions: - :param log_std: - :param latent_sde: - :return: """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -174,8 +157,8 @@ def actions_from_params( latent_sde: torch.Tensor, deterministic: bool = False, ) -> torch.Tensor: - # Update the proba distribution - self.proba_distribution(mean_actions, log_std, latent_sde) + # Update the distribution + self.get_distribution(mean_actions, log_std, latent_sde) if deterministic: return self.mode() return self.sample() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9bf7c9f7..1cc3e0ef 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): with torch.inference_mode(): for i in range(self.num_steps_per_env): # Reset noise for smooth exploration - if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + if self.policy_cfg["smooth_exploration"] and i == 0: self.alg.actor_critic.actor.episode_noise = None actions = self.alg.act(actor_obs, critic_obs) From d838dc6d140b648f84300efd4f92c9f714745e1c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 11:34:49 -0500 Subject: [PATCH 03/42] start moving things to SmoothActor --- learning/modules/actor.py | 22 +++++++++++++++++++++- learning/modules/actor_critic.py | 14 -------------- learning/modules/critic.py | 2 +- learning/modules/utils/gSDE.py | 2 -- learning/modules/utils/neural_net.py | 5 +++-- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 857d7326..c8ffe255 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -4,6 +4,7 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd +from .utils import StateDependentNoiseDistribution from gym import LEGGED_GYM_ROOT_DIR @@ -32,7 +33,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -85,11 +90,26 @@ def export(self, path): class SmoothActor(Actor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Create latent NN + self.NN = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + # State dependent action distribution + self.distribution = StateDependentNoiseDistribution( + self.num_actions, + self.num_obs, + ) # Noise for smooth exploration self.episode_noise = None + # Debug mode for plotting self.debug = False def act(self, observations): + # TODO[lm]: update distribution for gSDE self.update_distribution(observations) mean = self.distribution.mean if self.episode_noise is None: diff --git 
a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 669a9f6a..770fdc71 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,7 +2,6 @@ from .actor import Actor, SmoothActor from .critic import Critic -from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -52,19 +51,6 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") - # TODO[lm]: Decide how to handle the state dependent noise distribution in - # this class. In stable-baselines3 there is a class MlpExtractor which does - # what the Actor and Critic classes do here, just with the latent representation - # of the networks. Either I make a new class in a similar way and store the - # action distribution here, or I make the changes in Actor and Critic and change - # the distribution there. - if smooth_exploration: - self.action_dist = StateDependentNoiseDistribution( - num_actions, - num_actor_obs, - num_critic_obs, - ) - @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..430eca33 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,7 +20,7 @@ def __init__( ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 6e04b0e9..cd100053 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -26,7 +26,6 @@ def __init__( self, num_actions: int, num_actor_obs: int, - num_critic_obs: int, full_std: bool = True, use_expln: bool = False, learn_features: bool = False, @@ -34,7 +33,6 @@ def __init__( ): self.num_actions = num_actions self.num_actor_obs = num_actor_obs - self.num_critic_obs = num_critic_obs self.full_std = full_std self.use_expln = use_expln self.learn_features = learn_features diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index fa25d77b..057e724f 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -20,8 +20,9 @@ def create_MLP( # TODO[lm]: Could also create a separate function that gives the latent # representation used for smooth exploration (but if it doesn't mess up # anything, this is simpler) - if i == len(hidden_dims) - 1 and not latent: - add_layer(layers, hidden_dims[i], num_outputs) + if i == len(hidden_dims) - 1: + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, From aea5d88fb736087108cf37e66aa0ea8093638514 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 16:25:05 -0500 Subject: [PATCH 04/42] moved everything to SmoothActor (it runs) --- learning/modules/actor.py | 41 ------- learning/modules/actor_critic.py | 3 +- learning/modules/smooth_actor.py | 130 ++++++++++++++++++++ learning/modules/utils/__init__.py | 1 - learning/modules/utils/gSDE.py | 172 --------------------------- learning/runners/on_policy_runner.py | 7 +- plots/plot.py | 10 +- 7 files changed, 140 insertions(+), 224 deletions(-) create mode 100644 learning/modules/smooth_actor.py delete mode 100644 learning/modules/utils/gSDE.py diff --git a/learning/modules/actor.py b/learning/modules/actor.py index c8ffe255..17cf0566 100644 --- a/learning/modules/actor.py +++ 
b/learning/modules/actor.py @@ -4,9 +4,6 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd -from .utils import StateDependentNoiseDistribution - -from gym import LEGGED_GYM_ROOT_DIR class Actor(nn.Module): @@ -85,41 +82,3 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) - - -class SmoothActor(Actor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Create latent NN - self.NN = create_MLP( - self.num_obs, - self.num_actions, - self.hidden_dims, - self.activation, - latent=True, - ) - # State dependent action distribution - self.distribution = StateDependentNoiseDistribution( - self.num_actions, - self.num_obs, - ) - # Noise for smooth exploration - self.episode_noise = None - # Debug mode for plotting - self.debug = False - - def act(self, observations): - # TODO[lm]: update distribution for gSDE - self.update_distribution(observations) - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - - if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 770fdc71..29f6cdf7 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,7 @@ import torch.nn as nn -from .actor import Actor, SmoothActor +from .actor import Actor +from .smooth_actor import SmoothActor from .critic import Critic diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..6c6a42c7 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + log_std_init: float = -2.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_expln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix (this sets the exploration matrices) + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std() + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return ( + torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std + ) + + def update_distribution(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + + def act_inference(self, observations): + if self._normalize_obs: + observations = 
self.normalize(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 3bc82600..1422b82d 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,3 +1,2 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd -from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py deleted file mode 100644 index cd100053..00000000 --- a/learning/modules/utils/gSDE.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import Tuple, Optional - -import torch -import torch.nn as nn -from torch.distributions import Normal - - -# The following implementation was used in the gSDE paper on smooth exploration. -# See code: -# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 -# TODO[lm]: Need to update some of the naming conventions to fit our codebase. -class StateDependentNoiseDistribution: - """ - Distribution class for using generalized State Dependent Exploration (gSDE). - It is used to create the noise exploration matrix and compute the log - probability of an action with that noise. - """ - - latent_sde_dim: Optional[int] - weights_dist: Normal - _latent_sde: torch.Tensor - exploration_mat: torch.Tensor - exploration_matrices: torch.Tensor - - def __init__( - self, - num_actions: int, - num_actor_obs: int, - full_std: bool = True, - use_expln: bool = False, - learn_features: bool = False, - epsilon: float = 1e-6, - ): - self.num_actions = num_actions - self.num_actor_obs = num_actor_obs - self.full_std = full_std - self.use_expln = use_expln - self.learn_features = learn_features - self.epsilon = epsilon - self.mean_actions = None - self.log_std = None - - def get_std(self, log_std: torch.Tensor) -> torch.Tensor: - """ - Get the standard deviation from the learned parameter - (log of it by default). This ensures that the std is positive. 
- """ - if self.use_expln: - # From gSDE paper, it allows to keep variance - # above zero and prevent it from growing too fast - below_threshold = torch.exp(log_std) * (log_std <= 0) - # Avoid NaN: zeros values that are below zero - safe_log_std = log_std * (log_std > 0) + self.epsilon - above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) - std = below_threshold + above_threshold - else: - # Use normal exponential - std = torch.exp(log_std) - - if self.full_std: - return std - assert self.latent_sde_dim is not None - # Reduce the number of parameters: - return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - - def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: - """ - Sample weights for the noise exploration matrix, - using a centered Gaussian distribution. - """ - std = self.get_std(log_std) - self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() - # Pre-compute matrices in case of parallel exploration - self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - - def get_distribution_net( - self, - latent_dim: int, - log_std_init: float = -2.0, - latent_sde_dim: Optional[int] = None, - ) -> Tuple[nn.Module, nn.Parameter]: - """ - Create the layers and parameter that represent the distribution: - one output will be the deterministic action, the other parameter will be the - standard deviation of the distribution that control the weights of the noise - matrix. - """ - # Network for the deterministic action, it represents the mean of the - # distribution - mean_actions_net = nn.Linear(latent_dim, self.action_dim) - # When we learn features for the noise, the feature dimension - # can be different between the policy and the noise network - self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim - # Reduce the number of parameters if needed - log_std = ( - torch.ones(self.latent_sde_dim, self.action_dim) - if self.full_std - else torch.ones(self.latent_sde_dim, 1) - ) - # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) - # Sample an exploration matrix - self.sample_weights(log_std) - return mean_actions_net, log_std - - def get_distribution( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> "StateDependentNoiseDistribution": - """ - Create the distribution given its parameters (mean, std) - """ - # Stop gradient if we don't want to influence the features - self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) - self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) - return self - - def log_prob(self, actions: torch.Tensor): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.log_prob(actions).sum(-1) - - def entropy(self): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.entropy().sum(-1) - - def sample(self) -> torch.Tensor: - noise = self.get_noise(self._latent_sde) - actions = self.distribution.mean + noise - return actions - - def mode(self) -> torch.Tensor: - return self.distribution.mean - - def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: - latent_sde = latent_sde if self.learn_features else latent_sde.detach() - # Default case: only 
one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat) - # Use batch matrix multiplication for efficient computation - # (batch_size, n_features) -> (batch_size, 1, n_features) - latent_sde = latent_sde.unsqueeze(dim=1) - # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) - return noise.squeeze(dim=1) - - def actions_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - deterministic: bool = False, - ) -> torch.Tensor: - # Update the distribution - self.get_distribution(mean_actions, log_std, latent_sde) - if deterministic: - return self.mode() - return self.sample() - - def log_prob_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - actions = self.actions_from_params(mean_actions, log_std, latent_sde) - log_prob = self.log_prob(actions) - return actions, log_prob diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1cc3e0ef..4f68e164 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,13 +34,12 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + # * Re-sample noise matrix for smooth exploration + if self.policy_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights() # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - # Reset noise for smooth exploration - if self.policy_cfg["smooth_exploration"] and i == 0: - self.alg.actor_critic.actor.episode_noise = None - actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 2a33eafa..adda11ce 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -5,13 +5,13 @@ name = "distribution_smooth" data = pd.read_csv(name + ".csv") -# Plot the data -n = 200 -plt.plot(data.iloc[:n, 0]) -plt.plot(data.iloc[:n, 1]) +# Plot the data (last n steps) +n = 500 +plt.plot(data.iloc[-n:, 0]) +plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action") -plt.title("Smoothing every rollout") +plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 3603205cf8ab988e04af4e0b69412768b04b4607 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 18:21:56 -0500 Subject: [PATCH 05/42] possibly resample in PPO update --- learning/algorithms/ppo.py | 6 +++++- learning/modules/__init__.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 3ca5a5b4..ad31f846 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 +34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -152,6 +152,10 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. 
+ if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..86c96759 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .smooth_actor import SmoothActor From bb0a273982a5b90f3b97bbbfca4c8f7851fa5b63 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 11:04:36 -0500 Subject: [PATCH 06/42] learn_features=True and correct sample dim --- learning/modules/smooth_actor.py | 4 ++-- learning/runners/on_policy_runner.py | 2 +- plots/plot.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 6c6a42c7..200f03f2 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -21,7 +21,7 @@ def __init__( *args, full_std: bool = True, use_exp_ln: bool = False, - learn_features: bool = False, + learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = -2.0, **kwargs, @@ -49,7 +49,7 @@ def __init__( else: log_std = torch.ones(self.latent_dim, 1) self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) - # Sample an exploration matrix (this sets the exploration matrices) + # Sample an exploration matrix self.sample_weights() self.distribution = None diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 4f68e164..e183b816 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -36,7 +36,7 @@ def learn(self): logger.tic("collection") # * Re-sample noise matrix for smooth exploration if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights() + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): diff --git a/plots/plot.py b/plots/plot.py index adda11ce..5de8ba35 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -10,7 +10,7 @@ plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") -plt.ylabel("action") +plt.ylabel("action (NN output)") plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 40025c43de53627d9ca615c0aa40d9b88476755d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 12:01:29 -0500 Subject: [PATCH 07/42] adjust sample freq and update plotting --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 1 + learning/modules/actor.py | 16 +++++++++++++++- learning/modules/smooth_actor.py | 7 +++---- learning/runners/on_policy_runner.py | 10 +++++++--- plots/plot.py | 4 ++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2f894bde..fda3f345 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -75,6 +75,7 @@ class policy(MiniCheetahRunnerCfg.policy): # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True + exploration_sample_freq = 8 actor_obs = [ "base_ang_vel", diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index 17cf0566..25af26e6 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -42,6 +44,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -62,7 +67,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -82,3 +92,7 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 200f03f2..bc262798 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = False + self.debug = True def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -103,9 +103,8 @@ def act(self, observations): mean = self.distribution.mean sample = mean + self.get_noise() if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) return sample def act_inference(self, observations): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e183b816..3b7ce33e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,12 +34,16 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - # * Re-sample noise matrix for smooth exploration - if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 5de8ba35..5018cb4d 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -6,12 +6,12 @@ data = pd.read_csv(name + ".csv") # Plot the data (last n steps) -n = 500 +n = 200 plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action (NN output)") -plt.title("gSDE") +plt.title("gSDE (sample_freq=8)") plt.legend(["mean", "sample"]) # plt.show() From 
c3017461e5ba7629e02dc75a5448fc4bf81e57c4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 15 Mar 2024 14:17:22 -0400 Subject: [PATCH 08/42] update log_std_init=0.0 and refactor --- learning/modules/smooth_actor.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index bc262798..91754aeb 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -23,12 +23,12 @@ def __init__( use_exp_ln: bool = False, learn_features: bool = True, epsilon: float = 1e-6, - log_std_init: float = -2.0, + log_std_init: float = 0.0, **kwargs, ): super().__init__(*args, **kwargs) self.full_std = full_std - self.use_expln = use_exp_ln + self.use_exp_ln = use_exp_ln self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = True + self.debug = False def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -67,7 +67,7 @@ def sample_weights(self, batch_size=1): def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally - if self.use_expln: + if self.use_exp_ln: # From gSDE paper, it allows to keep variance # above zero and prevent it from growing too fast below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) @@ -83,9 +83,7 @@ def get_std(self): return std assert self.latent_dim is not None # Reduce the number of parameters: - return ( - torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std - ) + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std def update_distribution(self, observations): if self._normalize_obs: From a372a5274224c7b0bd18d6232452bb00cbec8778 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 3 Apr 2024 14:01:52 -0400 Subject: [PATCH 09/42] log joint data for training and play --- .gitignore | 6 +- gym/smooth_exploration/plot_play.py | 38 +++++++++ gym/smooth_exploration/plot_train.py | 55 +++++++++++++ learning/runners/on_policy_runner.py | 13 ++- plots/plot.py | 19 ----- scripts/log_play.py | 117 +++++++++++++++++++++++++++ scripts/log_train.py | 92 +++++++++++++++++++++ 7 files changed, 318 insertions(+), 22 deletions(-) create mode 100644 gym/smooth_exploration/plot_play.py create mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 plots/plot.py create mode 100644 scripts/log_play.py create mode 100644 scripts/log_train.py diff --git a/.gitignore b/.gitignore index d2761927..20093cee 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -plots/*.csv -plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ @@ -83,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data +gym/smooth_exploration/figures diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..eedee17f --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data/play/" + name +fig_dir = "./figures/play" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, 
figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") +axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..8a45729f --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,55 @@ +import numpy as np +import matplotlib.pyplot as plt + +SMOOTH = True +name = "ref_sample_8" +data_dir = "./data/train/" + name +fig_dir = "./figures/train" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] + +iterations = range(0, dof_pos_obs.shape[0], 10) + +# plot data for each iteration +for it in iterations: + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + for i in range(3): + axs[0].plot(dof_pos_obs[it, :, i]) + axs[0].set_title("dof_pos_obs") + axs[0].legend(["idx 0", "idx 1", "idx 2"]) + axs[0].set_xlabel("time steps") + + for i in range(3): + axs[1].plot(dof_pos_target[it, :, i]) + axs[1].set_title("dof_pos_target") + axs[1].legend(["idx 0", "idx 1", "idx 2"]) + axs[1].set_xlabel("time steps") + + for i in range(3): + axs[2].plot(dof_vel[it, :, i]) + axs[2].set_title("dof_vel") + axs[2].legend(["idx 0", "idx 1", "idx 2"]) + axs[2].set_xlabel("time steps") + + for i in range(3): + axs[3].plot(torques[it, :, i]) + axs[3].set_title("torques") + axs[3].legend(["idx 0", "idx 1", "idx 2"]) + axs[3].set_xlabel("time steps") + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], 8): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + plt.tight_layout() + plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9c167cd7..86541f9b 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -18,7 +18,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -53,6 +53,17 @@ def learn(self): self.env.step() + # * Log states + # This continuously overwrites the rollout data, so when the state + # dict is written to a file it contains the last rollout for each + # iteration. 
+ it_idx = self.it - 1 + if states_to_log_dict is not None: + for state in states_to_log_dict: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] + actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) diff --git a/plots/plot.py b/plots/plot.py deleted file mode 100644 index 5018cb4d..00000000 --- a/plots/plot.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -# Read the CSV file -name = "distribution_smooth" -data = pd.read_csv(name + ".csv") - -# Plot the data (last n steps) -n = 200 -plt.plot(data.iloc[-n:, 0]) -plt.plot(data.iloc[-n:, 1]) -plt.xlabel("timestep") -plt.ylabel("action (NN output)") -plt.title("gSDE (sample_freq=8)") -plt.legend(["mean", "sample"]) -# plt.show() - -# Save the plot to a file -plt.savefig(name + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..adc1b598 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,117 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + 
runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..918d0b8f --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,92 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 32 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) From 599b2cab114cbcbae72ead09ed682b4fa1b92ff7 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 6 Apr 2024 18:05:29 -0400 Subject: [PATCH 10/42] update logging and plotting --- .gitignore | 4 +- gym/smooth_exploration/plot_play.py | 4 +- gym/smooth_exploration/plot_train.py | 86 
++++++++++++++++++++-------- learning/runners/on_policy_runner.py | 57 ++++++++++++++---- scripts/log_play.py | 6 +- scripts/log_train.py | 13 ++++- 6 files changed, 128 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 20093cee..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -83,5 +83,5 @@ env.bak/ venv.bak/ # Smooth exploration -gym/smooth_exploration/data -gym/smooth_exploration/figures +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py index eedee17f..5768f91a 100644 --- a/gym/smooth_exploration/plot_play.py +++ b/gym/smooth_exploration/plot_play.py @@ -2,8 +2,8 @@ import matplotlib.pyplot as plt name = "mini_cheetah_ref" -data_dir = "./data/play/" + name -fig_dir = "./figures/play" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py index 8a45729f..58c50396 100644 --- a/gym/smooth_exploration/plot_train.py +++ b/gym/smooth_exploration/plot_train.py @@ -1,55 +1,93 @@ import numpy as np import matplotlib.pyplot as plt +import os +FOURIER = True SMOOTH = True -name = "ref_sample_8" -data_dir = "./data/train/" + name -fig_dir = "./figures/train" +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] dof_vel = np.load(data_dir + "/dof_vel.npy")[0] torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") -iterations = range(0, dof_pos_obs.shape[0], 10) # plot data for each iteration -for it in iterations: +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure fig, axs = plt.subplots(4, figsize=(10, 10)) plt.suptitle(name + " iteration " + str(it)) + axs[0].set_title("dof_pos_obs") for i in range(3): - axs[0].plot(dof_pos_obs[it, :, i]) - axs[0].set_title("dof_pos_obs") - axs[0].legend(["idx 0", "idx 1", "idx 2"]) - axs[0].set_xlabel("time steps") + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + axs[1].set_title("dof_pos_target") for i in range(3): - axs[1].plot(dof_pos_target[it, :, i]) - axs[1].set_title("dof_pos_target") - axs[1].legend(["idx 0", "idx 1", "idx 2"]) - axs[1].set_xlabel("time steps") + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], 
it) + axs[2].set_title("dof_vel") for i in range(3): - axs[2].plot(dof_vel[it, :, i]) - axs[2].set_title("dof_vel") - axs[2].legend(["idx 0", "idx 1", "idx 2"]) - axs[2].set_xlabel("time steps") + axs[2].plot(dof_vel[it, :n_steps, i]) + axs[3].set_title("torques") for i in range(3): - axs[3].plot(torques[it, :, i]) - axs[3].set_title("torques") - axs[3].legend(["idx 0", "idx 1", "idx 2"]) - axs[3].set_xlabel("time steps") + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) if SMOOTH: # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], 8): + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): axs[0].axvline(x, color="r", linestyle="--") axs[1].axvline(x, color="r", linestyle="--") axs[2].axvline(x, color="r", linestyle="--") axs[3].axvline(x, color="r", linestyle="--") - plt.tight_layout() - plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 86541f9b..0729b994 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,6 +34,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -53,17 +60,6 @@ def learn(self, states_to_log_dict=None): self.env.step() - # * Log states - # This continuously overwrites the rollout data, so when the state - # dict is written to a file it contains the last rollout for each - # iteration. - it_idx = self.it - 1 - if states_to_log_dict is not None: - for state in states_to_log_dict: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] - actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) @@ -171,3 +167,42 @@ def init_storage(self): critic_obs_shape=[num_critic_obs], action_shape=[num_actions], ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
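+        # Note: shapes follow the (1, iterations, steps, dim) logging tensors,
+        # and only the first environment (index 0) is recorded.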
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py index adc1b598..f7fd68e8 100644 --- a/scripts/log_play.py +++ b/scripts/log_play.py @@ -66,7 +66,11 @@ def play(env, runner, train_cfg): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) diff --git a/scripts/log_train.py b/scripts/log_train.py index 918d0b8f..37fca0da 100644 --- a/scripts/log_train.py +++ b/scripts/log_train.py @@ -10,7 +10,7 @@ import numpy as np TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 32 +ROLLOUT_TIMESTEPS = 1000 def create_logging_dict(runner): @@ -39,6 +39,11 @@ def create_logging_dict(runner): (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), device=runner.env.device, ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) return states_to_log_dict @@ -65,7 +70,11 @@ def train(train_cfg, policy_runner): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) From 4994550b747b289011c91eddcfd40d4d2f59acff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 16:20:08 -0400 Subject: [PATCH 11/42] plot FT script --- gym/smooth_exploration/plot_ft.py | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 gym/smooth_exploration/plot_ft.py diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] 
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") From 0205d99b33fc338c6ed3644ef55638fcc6bb1b5e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 18:47:36 -0400 Subject: [PATCH 12/42] added sweep config --- .../sweep_config_smooth_exploration.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..952282fe --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,19 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + }, + "train_cfg.runner.max_iterations":{ + "values": [800] + } + } +} \ No newline at end of file From 64b9b1446f4d63ca150db859d889b6d643a8b296 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 24 Apr 2024 13:56:26 -0400 Subject: [PATCH 13/42] update on policy and old policy runners, get nans for log_probs --- learning/algorithms/ppo2.py | 9 +++++++-- learning/modules/smooth_actor.py | 25 ++++++++++++++----------- learning/runners/BaseRunner.py | 7 +++++-- learning/runners/old_policy_runner.py | 18 ++++++++++++++++-- learning/runners/on_policy_runner.py | 12 +++++++----- 5 files changed, 49 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..b6b4fec0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -99,7 +100,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = 
self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +112,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 91754aeb..dae53c9e 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -13,14 +13,13 @@ class SmoothActor(Actor): weights_dist: Normal _latent_sde: torch.Tensor - exploration_mat: torch.Tensor exploration_matrices: torch.Tensor def __init__( self, *args, full_std: bool = True, - use_exp_ln: bool = False, + use_exp_ln: bool = True, learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, @@ -60,8 +59,6 @@ def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix std = self.get_std() self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) @@ -87,12 +84,20 @@ def get_std(self): def update_distribution(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) # Get latent features and compute distribution self._latent_sde = self.latent_net(observations) if not self.learn_features: self._latent_sde = self._latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum( + "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 + ) + else: + raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) @@ -107,7 +112,8 @@ def act(self, observations): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) latent_sde = self.latent_net(observations) mean_actions = self.mean_actions_net(latent_sde) return mean_actions @@ -116,12 +122,9 @@ def get_noise(self): latent_sde = self._latent_sde if not self.learn_features: latent_sde = latent_sde.detach() - # Default case: only one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) # Use batch matrix multiplication for efficient computation # (batch_size, n_features) -> (batch_size, 1, n_features) latent_sde = latent_sde.unsqueeze(dim=1) # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) return noise.squeeze(dim=1) diff --git 
a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3d38b48a..79f086d7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -32,6 +32,10 @@ def learn(self, states_to_log_dict=None): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -65,11 +69,9 @@ def learn(self, 
states_to_log_dict=None): with torch.inference_mode(): for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) self.set_actions( From 91656e9b81d4a1722100d100337c63dd31d0cd56 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 25 Apr 2024 20:08:52 -0400 Subject: [PATCH 14/42] run 200 iterations before starting training, to burn in normalization --- learning/runners/on_policy_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 79f086d7..fc754e17 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -54,6 +54,20 @@ def learn(self, states_to_log_dict=None): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") From f74e7feaa103c7ace373c55b9e95db0d0a21f38f Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 14:37:30 -0400 Subject: [PATCH 15/42] update dummy input in export_network --- learning/modules/utils/neural_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 057e724f..c28762be 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -80,7 +80,7 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) From 18045e9289dec5f6d8c3cc74cacaf6b873489fe3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 18:50:12 -0400 Subject: [PATCH 16/42] good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02 --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 8 ++++---- learning/algorithms/ppo2.py | 4 ++-- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2e108f16..a6f12e18 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True - exploration_sample_freq = 8 + exploration_sample_freq = 16 normalize_obs = True obs = [ @@ -151,17 +151,17 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 
storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 + learning_rate = 0.002 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index b6b4fec0..58945bb3 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -139,9 +139,9 @@ def update_actor(self, data): kl_mean = torch.mean(kl) if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max(2e-4, self.learning_rate / 1.1) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min(1e-2, self.learning_rate * 1.1) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fc754e17..1e885e81 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -167,7 +167,9 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From adfb0788fb61883f20c15dbacc93ec9010b471ac Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sun, 5 May 2024 22:48:28 -0400 Subject: [PATCH 17/42] export latent network and update configs --- gym/envs/mini_cheetah/mini_cheetah_config.py | 24 +++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 7 ++++-- learning/algorithms/ppo2.py | 13 ++++++++-- learning/modules/utils/neural_net.py | 10 +++++++- .../sweep_config_smooth_exploration.json | 3 --- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 104e5f64..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,16 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -195,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
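+        # The adaptive schedule multiplies/divides learning_rate by lr_ratio and
+        # clips it to lr_range, depending on the measured KL (see PPO2.update_actor)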
+ lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index a6f12e18..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -151,13 +151,16 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 58945bb3..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -27,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -34,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -137,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(2e-4, self.learning_rate / 1.1) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.1) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index c28762be..24ed3df7 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -62,7 +62,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ Thsi function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -84,3 +84,11 @@ def export_network(network, network_name, path, num_inputs): model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = model.latent_net + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json index 952282fe..49fc4125 100644 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -11,9 +11,6 @@ }, "train_cfg.runner.num_steps_per_env":{ "values": [32, 64] - }, - "train_cfg.runner.max_iterations":{ - "values": [800] } } } \ No newline at end of file From 8ff82ae26b1e73daf0f72dc76af7756aab12de83 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 11:13:00 -0400 Subject: [PATCH 18/42] export latent net with norm and log get_std --- learning/modules/smooth_actor.py | 9 ++++----- learning/modules/utils/neural_net.py | 2 +- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index dae53c9e..5cb1da89 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -57,11 +57,12 @@ def __init__( def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix - std = self.get_std() + std = self.get_std self.weights_dist = Normal(torch.zeros_like(std), std) # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + @property def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally if self.use_exp_ln: @@ -91,11 +92,9 @@ def update_distribution(self, observations): if not self.learn_features: self._latent_sde = self._latent_sde.detach() if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + variance = torch.mm(self._latent_sde**2, self.get_std**2) elif self._latent_sde.dim() == 3: - variance = torch.einsum( - "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 - ) + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) else: raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 24ed3df7..034c13d4 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -87,7 +87,7 @@ def export_network(network, network_name, path, num_inputs, latent=True): if latent: path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = model.latent_net + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1e885e81..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -171,7 +171,9 @@ def set_up_logger(self): 
self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) From 007ea933c6d72077c8582e44162a121490cc529d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 14:22:30 -0400 Subject: [PATCH 19/42] export actor std to txt file --- learning/modules/utils/neural_net.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 034c13d4..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,6 +1,7 @@ import torch import os import copy +import numpy as np def create_MLP( @@ -86,9 +87,16 @@ def export_network(network, network_name, path, num_inputs, latent=True): torch.onnx.export(model_traced, dummy_input, path_onnx) if latent: + # Export latent model path_latent = os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.numpy().T + np.savetxt(path_std, std_transposed) From 1ae4ac007fe312a3b4efbef3937e7ba9f3d8117a Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 5 Jul 2024 16:10:31 -0400 Subject: [PATCH 20/42] cleanup branch, remove logging and plotting --- .gitignore | 4 - gym/envs/mini_cheetah/mini_cheetah_config.py | 4 +- gym/smooth_exploration/plot_ft.py | 56 -------- gym/smooth_exploration/plot_play.py | 38 ------ gym/smooth_exploration/plot_train.py | 93 -------------- scripts/log_play.py | 121 ------------------ scripts/log_train.py | 101 --------------- .../sweep_config_smooth_exploration.json | 16 --- 8 files changed, 2 insertions(+), 431 deletions(-) delete mode 100644 gym/smooth_exploration/plot_ft.py delete mode 100644 gym/smooth_exploration/plot_play.py delete mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 scripts/log_play.py delete mode 100644 scripts/log_train.py delete mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/.gitignore b/.gitignore index c78e6696..0b68375a 100644 --- a/.gitignore +++ b/.gitignore @@ -81,7 +81,3 @@ ipython_config.py venv/ env.bak/ venv.bak/ - -# Smooth exploration -gym/smooth_exploration/data* -gym/smooth_exploration/figures* diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 6631704c..2bd7958d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -134,13 +134,13 @@ class actor: exploration_sample_freq = 16 obs = [ - # "base_lin_vel", + "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - # "dof_pos_target", + "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py deleted file mode 100644 index 5ad86b63..00000000 --- a/gym/smooth_exploration/plot_ft.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np 
-import matplotlib.pyplot as plt -import os - -SAMPLE_FREQ = 8 -STEPS = 500 - -smooth_name = "ref_sample_8_len_500" -baseline_name = "ref_baseline_len_500" - -smooth_data_dir = "./data_train/" + smooth_name -baseline_data_dir = "./data_train/" + baseline_name -fig_dir = "./figures_train/" - -if not os.path.exists(fig_dir): - os.makedirs(fig_dir) - -# load data -smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] -baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] -smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] -baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] - -# compute FFT averages -smooth_ffts = [[], [], []] -baseline_ffts = [[], [], []] -for it in range(0, smooth_pos_target.shape[0], 10): - # only use data that didn't terminate - if not np.any(smooth_terminated[it, :, 0]): - for idx in range(3): - fft = np.fft.fft(smooth_pos_target[it, :, idx]) - smooth_ffts[idx].append(fft[: len(fft) // 2]) - - if not np.any(baseline_terminated[it, :, 0]): - for idx in range(3): - fft = np.fft.fft(baseline_pos_target[it, :, idx]) - baseline_ffts[idx].append(fft[: len(fft) // 2]) - -print(f"Total smooth FFTS: {len(smooth_ffts[0])}") -print(f"Total baseline FFTS: {len(baseline_ffts[0])}") - -smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] -baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] - -# plot FFTs -fig, axs = plt.subplots(3, 1, figsize=(10, 10)) -for idx in range(3): - axs[idx].plot(np.abs(smooth_fft_means[idx])) - axs[idx].plot(np.abs(baseline_fft_means[idx])) - axs[idx].set_title(f"FT Amplitude idx {idx}") - axs[idx].set_xlabel("Frequency") - axs[idx].set_ylabel("Amplitude") - axs[idx].legend(["smooth", "baseline"]) - -fig.tight_layout() -fig.savefig(fig_dir + "/" + smooth_name + ".png") diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py deleted file mode 100644 index 5768f91a..00000000 --- a/gym/smooth_exploration/plot_play.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt - -name = "mini_cheetah_ref" -data_dir = "./data_play/" + name -fig_dir = "./figures_play/" + name - -# load data -dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] -dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] -dof_vel = np.load(data_dir + "/dof_vel.npy")[0] - -# plot data -n_steps = 200 -fig, axs = plt.subplots(3, figsize=(10, 10)) -plt.suptitle(name) - -for i in range(3): - axs[0].plot(dof_pos_obs[:n_steps, i]) -axs[0].set_title("dof_pos_obs") -axs[0].legend(["idx 0", "idx 1", "idx 2"]) -axs[0].set_xlabel("time steps") - -for i in range(3): - axs[1].plot(dof_pos_target[:n_steps, i]) -axs[1].set_title("dof_pos_target") -axs[1].legend(["idx 0", "idx 1", "idx 2"]) -axs[1].set_xlabel("time steps") - -for i in range(3): - axs[2].plot(dof_vel[:n_steps, i]) -axs[2].set_title("dof_vel") -axs[2].legend(["idx 0", "idx 1", "idx 2"]) -axs[2].set_xlabel("time steps") - -plt.tight_layout() -plt.savefig(fig_dir + "/" + name + ".png") -plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py deleted file mode 100644 index 58c50396..00000000 --- a/gym/smooth_exploration/plot_train.py +++ /dev/null @@ -1,93 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -import os - -FOURIER = True -SMOOTH = True -SAMPLE_FREQ = 16 -STEPS = 1000 - -name = "ref_sample_16_len_1000" -data_dir = "./data_train/" + name -fig_dir = "./figures_train/" + name - 
-if not os.path.exists(fig_dir): - os.makedirs(fig_dir) - -# load data -dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] -dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] -dof_vel = np.load(data_dir + "/dof_vel.npy")[0] -torques = np.load(data_dir + "/torques.npy")[0] -terminated = np.load(data_dir + "/terminated.npy")[0] - - -# plot fourier trainsform -def plot_fourier(data, it): - fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) - for i in range(3): - ft = np.fft.fft(data[:, i]) - ft_half = ft[: len(ft) // 2] - axs_ft[0].plot(np.abs(ft_half)) - axs_ft[1].plot(np.angle(ft_half)) - - axs_ft[0].set_title("FT Amplitude") - axs_ft[0].set_xlabel("Frequency") - axs_ft[0].set_ylabel("Amplitude") - axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) - axs_ft[1].set_title("FT Phase") - axs_ft[1].set_xlabel("Frequency") - axs_ft[1].set_ylabel("Phase") - axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) - - fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") - - -# plot data for each iteration -for it in range(0, dof_pos_obs.shape[0], 10): - # check if iteration terminated - terminate_idx = np.where(terminated[it, :, 0] == 1)[0] - if terminate_idx.size > 0: - n_steps = terminate_idx[0] - else: - n_steps = dof_pos_obs.shape[1] - print(n_steps) - - # generate figure - fig, axs = plt.subplots(4, figsize=(10, 10)) - plt.suptitle(name + " iteration " + str(it)) - - axs[0].set_title("dof_pos_obs") - for i in range(3): - axs[0].plot(dof_pos_obs[it, :n_steps, i]) - - axs[1].set_title("dof_pos_target") - for i in range(3): - axs[1].plot(dof_pos_target[it, :n_steps, i]) - if FOURIER and n_steps == STEPS: - plot_fourier(dof_pos_target[it, :n_steps, :], it) - - axs[2].set_title("dof_vel") - for i in range(3): - axs[2].plot(dof_vel[it, :n_steps, i]) - - axs[3].set_title("torques") - for i in range(3): - axs[3].plot(torques[it, :n_steps, i]) - - # format plots - for idx in range(4): - axs[idx].legend(["idx 0", "idx 1", "idx 2"]) - axs[idx].set_xlabel("time steps") - axs[idx].set_xlim([0, n_steps]) - - if SMOOTH: - # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): - axs[0].axvline(x, color="r", linestyle="--") - axs[1].axvline(x, color="r", linestyle="--") - axs[2].axvline(x, color="r", linestyle="--") - axs[3].axvline(x, color="r", linestyle="--") - - fig.tight_layout() - fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py deleted file mode 100644 index f7fd68e8..00000000 --- a/scripts/log_play.py +++ /dev/null @@ -1,121 +0,0 @@ -from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry -from gym.utils import KeyboardInterface -from gym.utils import VisualizationRecorder -from gym import LEGGED_GYM_ROOT_DIR - -# torch needs to be imported after isaacgym imports in local source -import torch -import os -import numpy as np - -TEST_TOTAL_TIMESTEPS = 1000 - - -def create_logging_dict(runner, test_total_timesteps): - states_to_log = [ - "dof_pos_target", - "dof_pos_obs", - "dof_vel", - "torques", - "commands", - # 'base_lin_vel', - # 'base_ang_vel', - # 'oscillators', - # 'grf', - # 'base_height' - ] - - states_to_log_dict = {} - - for state in states_to_log: - array_dim = runner.get_obs_size( - [ - state, - ] - ) - states_to_log_dict[state] = torch.zeros( - (1, test_total_timesteps, array_dim), device=runner.env.device - ) - return states_to_log_dict - - -def setup(args): - env_cfg, train_cfg = task_registry.create_cfgs(args) - 
env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) - if hasattr(env_cfg, "push_robots"): - env_cfg.push_robots.toggle = False - if hasattr(env_cfg, "commands"): - env_cfg.commands.resampling_time = 9999 - env_cfg.env.episode_length_s = 9999 - env_cfg.env.num_projectiles = 20 - task_registry.make_gym_and_sim() - env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" - train_cfg.runner.resume = True - train_cfg.logging.enable_local_saving = False - runner = task_registry.make_alg_runner(env, train_cfg) - - # * switch to evaluation mode (dropout for example) - runner.switch_to_eval() - return env, runner, train_cfg - - -def play(env, runner, train_cfg): - protocol_name = train_cfg.runner.experiment_name - - # * set up logging - log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, - "gym", - "smooth_exploration", - "data_play", - protocol_name + ".npz", - ) - if not os.path.exists(os.path.dirname(log_file_path)): - os.makedirs(os.path.dirname(log_file_path)) - - states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) - - # * set up recording - if env.cfg.viewer.record: - recorder = VisualizationRecorder( - env, train_cfg.runner.experiment_name, train_cfg.runner.load_run - ) - - # * set up interface: GamepadInterface(env) or KeyboardInterface(env) - COMMANDS_INTERFACE = hasattr(env, "commands") - if COMMANDS_INTERFACE: - # interface = GamepadInterface(env) - interface = KeyboardInterface(env) - - for t in range(TEST_TOTAL_TIMESTEPS): - if COMMANDS_INTERFACE: - interface.update(env) - if env.cfg.viewer.record: - recorder.update(t) - runner.set_actions( - runner.policy_cfg["actions"], - runner.get_inference_actions(), - runner.policy_cfg["disable_actions"], - ) - env.step() - env.check_exit() - - # * log - for state in states_to_log_dict: - states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] - - # * save data - # first convert tensors to cpu - log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} - np.savez_compressed(log_file_path, **log_dict_cpu) - print("saved to ", log_file_path) - return states_to_log_dict - - -if __name__ == "__main__": - args = get_args() - with torch.no_grad(): - env, runner, train_cfg = setup(args) - play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py deleted file mode 100644 index 37fca0da..00000000 --- a/scripts/log_train.py +++ /dev/null @@ -1,101 +0,0 @@ -from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters -from gym.utils.logging_and_saving import wandb_singleton -from gym.utils.logging_and_saving import local_code_save_helper -from gym import LEGGED_GYM_ROOT_DIR - -# torch needs to be imported after isaacgym imports in local source -import torch -import os -import numpy as np - -TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 1000 - - -def create_logging_dict(runner): - states_to_log = [ - "dof_pos_target", - "dof_pos_obs", - "dof_vel", - "torques", - "commands", - # 'base_lin_vel', - # 'base_ang_vel', - # 'oscillators', - # 'grf', - # 'base_height' - ] - - states_to_log_dict = {} - - for state in states_to_log: - array_dim = runner.get_obs_size( - [ - state, - ] - ) - states_to_log_dict[state] = torch.zeros( - (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), - device=runner.env.device, - ) - - # log whether rollout terminated - states_to_log_dict["terminated"] = torch.zeros( - (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device - ) - return states_to_log_dict - - 
-def setup(): - args = get_args() - wandb_helper = wandb_singleton.WandbSingleton() - - env_cfg, train_cfg = task_registry.create_cfgs(args) - task_registry.make_gym_and_sim() - wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) - env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) - - train_cfg.runner.max_iterations = TRAIN_ITERATIONS - policy_runner = task_registry.make_alg_runner(env, train_cfg) - - local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) - - return train_cfg, policy_runner - - -def train(train_cfg, policy_runner): - protocol_name = train_cfg.runner.experiment_name - - # * set up logging - log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, - "gym", - "smooth_exploration", - "data_train", - protocol_name + ".npz", - ) - if not os.path.exists(os.path.dirname(log_file_path)): - os.makedirs(os.path.dirname(log_file_path)) - - states_to_log_dict = create_logging_dict(policy_runner) - - # * train - # wandb_helper = wandb_singleton.WandbSingleton() - - policy_runner.learn(states_to_log_dict) - - # wandb_helper.close_wandb() - - # * save data - # first convert tensors to cpu - log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} - np.savez_compressed(log_file_path, **log_dict_cpu) - print("saved to ", log_file_path) - return states_to_log_dict - - -if __name__ == "__main__": - train_cfg, policy_runner = setup() - train(train_cfg=train_cfg, policy_runner=policy_runner) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json deleted file mode 100644 index 49fc4125..00000000 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "method": "grid", - "metric": { - "name": "rewards/total_rewards", - "goal": "maximize" - }, - "run_cap": 10, - "parameters": { - "train_cfg.policy.exploration_sample_freq":{ - "values": [4, 8, 16, 32] - }, - "train_cfg.runner.num_steps_per_env":{ - "values": [32, 64] - } - } -} \ No newline at end of file From 70da359c9f20f908138fe50eabf7019c8b05d7d1 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 8 Jul 2024 15:01:55 -0400 Subject: [PATCH 21/42] PPO, PPO-smooth and IPG all learn (IPG-smooth kind of) --- .../mini_cheetah/mini_cheetah_ref_config.py | 29 +- learning/algorithms/__init__.py | 1 + learning/algorithms/ppo2.py | 2 +- learning/algorithms/ppo_ipg.py | 271 +++++++++++++++++ learning/runners/__init__.py | 3 +- learning/runners/ipg_runner.py | 274 ++++++++++++++++++ learning/runners/on_policy_runner.py | 84 ++---- learning/storage/__init__.py | 3 +- learning/storage/replay_buffer.py | 54 ++++ learning/utils/__init__.py | 5 +- learning/utils/utils.py | 6 + 11 files changed, 650 insertions(+), 82 deletions(-) create mode 100644 learning/algorithms/ppo_ipg.py create mode 100644 learning/runners/ipg_runner.py create mode 100644 learning/storage/replay_buffer.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4f50f7d2..eba8c34a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,13 +67,13 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - 
smooth_exploration = True + smooth_exploration = False exploration_sample_freq = 16 normalize_obs = True @@ -142,6 +142,8 @@ class termination_weight: class algorithm(MiniCheetahRunnerCfg.algorithm): # training params + gamma = 0.99 + lam = 0.95 value_loss_coef = 1.0 # deprecate for PPO2 use_clipped_value_loss = True # deprecate for PPO2 clip_param = 0.2 @@ -149,22 +151,25 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 - storage_size = 2**17 # new mini_batch_size = 2**15 # new - discount_horizon = 1.0 # [s] - lam = 0.95 - GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.02 + + desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 max_grad_norm = 1.0 - # * Learning rate - learning_rate = 0.002 + learning_rate = 0.002 # initial value schedule = "adaptive" # can be adaptive or fixed lr_range = [3e-4, 1e-2] - lr_ratio = 1.3 + lr_ratio = 1.3 # 1.3 for smooth-expl, else 1.5 + + # IPG + storage_size = 4 * 32 * 4096 # old_policies*nsteps*num_envs + polyak = 0.995 + use_cv = False + inter_nu = 0.2 + beta = "off_policy" class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 800 # number of policy updates - algorithm_class_name = "PPO2" + max_iterations = 500 # number of policy updates + algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index ace1b9fe..b775a439 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -32,4 +32,5 @@ from .ppo import PPO from .ppo2 import PPO2 +from .ppo_ipg import PPO_IPG from .SE import StateEstimator diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 528ef882..e5912e3f 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -63,7 +63,7 @@ def switch_to_train(self): self.actor.train() self.critic.train() - def act(self, obs, critic_obs): + def act(self, obs): return self.actor.act(obs).detach() def update(self, data, last_obs=None): diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py new file mode 100644 index 00000000..ba92b575 --- /dev/null +++ b/learning/algorithms/ppo_ipg.py @@ -0,0 +1,271 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + polyak_update, +) +from learning.modules import SmoothActor + + +class PPO_IPG: + def __init__( + self, + actor, + critic_v, + critic_q, + target_critic_q, + num_learning_epochs=1, + num_mini_batches=1, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + polyak=0.995, + use_cv=False, + inter_nu=0.2, + beta="off_policy", + device="cpu", + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_v = critic_v.to(self.device) + self.critic_v_optimizer = optim.Adam( + self.critic_v.parameters(), lr=learning_rate + ) + + # * IPG components + self.critic_q = critic_q.to(self.device) + self.critic_q_optimizer = optim.Adam( + self.critic_q.parameters(), lr=learning_rate + ) + self.target_critic_q = target_critic_q.to(self.device) + 
self.target_critic_q.load_state_dict(self.critic_q.state_dict()) + + # * PPO parameters + self.clip_param = clip_param + self.num_learning_epochs = num_learning_epochs + self.num_mini_batches = num_mini_batches + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + # * IPG parameters + self.polyak = polyak + self.use_cv = use_cv + self.inter_nu = inter_nu + self.beta = beta + + def switch_to_train(self): + self.actor.train() + self.critic_v.train() + self.critic_q.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data_onpol, data_offpol, last_obs=None): + # On-policy GAE + if last_obs is None: + last_values = None + else: + with torch.no_grad(): + last_values = self.critic_v.evaluate(last_obs).detach() + compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v, last_values + ) + + self.update_critic_q(data_offpol) + + # Freeze Q-network params + for param in self.critic_q.parameters(): + param.requires_grad = False + + self.update_actor(data_onpol, data_offpol) + self.update_critic_v(data_onpol) + + # Un-freeze Q-network params + for param in self.critic_q.parameters(): + param.requires_grad = True + + # print("On-policy data shape: ", data_onpol.shape) + # print("Off-policy data shape: ", data_offpol.shape) + + def update_critic_q(self, data): + self.mean_q_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + with torch.no_grad(): + # TODO: check that should be inference + action_next = self.actor.act_inference(batch["next_actor_obs"]) + q_input_next = torch.cat( + (batch["next_critic_obs"], action_next), dim=-1 + ) + q_next = self.target_critic_q.evaluate(q_input_next) + q_target = ( + batch["rewards"] + + self.gamma * batch["dones"].logical_not() * q_next + ) + q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) + q_value = self.critic_q.evaluate(q_input) + q_loss = self.critic_q.loss_fn(q_value, q_target) + self.critic_q_optimizer.zero_grad() + q_loss.backward() + nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) + self.critic_q_optimizer.step() + self.mean_q_loss += q_loss.item() + counter += 1 + + # TODO: check where to do polyak update (IPG repo does it here) + self.target_critic_q = polyak_update( + self.critic_q, self.target_critic_q, self.polyak + ) + self.mean_q_loss /= counter + + def update_critic_v(self, data): + self.mean_value_loss = 0 + counter = 0 + + n, m = data.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + for batch in generator: + value_batch = self.critic_v.evaluate(batch["critic_obs"]) + value_loss = self.critic_v.loss_fn(value_batch, batch["returns"]) + self.critic_v_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) + self.critic_v_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data_onpol, data_offpol): + self.mean_surrogate_loss = 0 + self.mean_offpol_loss = 0 + counter = 0 + + self.actor.update_distribution(data_onpol["actor_obs"]) + data_onpol["old_sigma"] = self.actor.action_std.detach() + 
data_onpol["old_mu"] = self.actor.action_mean.detach() + data_onpol["old_actions_log_prob"] = self.actor.get_actions_log_prob( + data_onpol["actions"] + ).detach() + + # Generate off-policy batches and use all on-policy data + n, m = data_offpol.shape + total_data = n * m + batch_size = total_data // self.num_mini_batches + generator_offpol = create_uniform_generator( + data_offpol, batch_size, self.num_learning_epochs + ) + for batch_offpol in generator_offpol: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + + self.actor.update_distribution(data_onpol["actor_obs"]) + actions_log_prob_onpol = self.actor.get_actions_log_prob( + data_onpol["actions"] + ) + mu_onpol = self.actor.action_mean + sigma_onpol = self.actor.action_std + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_onpol / data_onpol["old_sigma"] + 1.0e-5) + + ( + torch.square(data_onpol["old_sigma"]) + + torch.square(data_onpol["old_mu"] - mu_onpol) + ) + / (2.0 * torch.square(sigma_onpol)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max(1e-5, self.learning_rate / 1.5) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min(1e-2, self.learning_rate * 1.5) + + for param_group in self.optimizer.param_groups: + # ! check this + param_group["lr"] = self.learning_rate + + # * On-policy surrogate loss + adv_onpol = data_onpol["advantages"] + if self.use_cv: + # TODO: control variate + critic_based_adv = 0 # get_control_variate(data_onpol, self.critic_v) + learning_signals = (adv_onpol - critic_based_adv) * (1 - self.inter_nu) + else: + learning_signals = adv_onpol * (1 - self.inter_nu) + + ratio = torch.exp( + actions_log_prob_onpol - data_onpol["old_actions_log_prob"] + ) + ratio_clipped = torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate = -learning_signals * ratio + surrogate_clipped = -learning_signals * ratio_clipped + loss_onpol = torch.max(surrogate, surrogate_clipped).mean() + + # * Off-policy loss + if self.beta == "on_policy": + loss_offpol = self.compute_loss_offpol(data_onpol) + elif self.beta == "off_policy": + loss_offpol = self.compute_loss_offpol(batch_offpol) + else: + raise ValueError(f"Invalid beta value: {self.beta}") + + if self.use_cv: + b = 1 + else: + b = self.inter_nu + + loss = loss_onpol + b * loss_offpol + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += loss_onpol.item() + self.mean_offpol_loss += loss_offpol.item() + counter += 1 + self.mean_surrogate_loss /= counter + self.mean_offpol_loss /= counter + + def compute_loss_offpol(self, data): + obs = data["actor_obs"] + actions = self.actor.act_inference(obs) + q_input = torch.cat((data["critic_obs"], actions), dim=-1) + q_value = self.critic_q.evaluate(q_input) + return -q_value.mean() diff --git a/learning/runners/__init__.py b/learning/runners/__init__.py index b0f49217..a2cc7576 100644 --- a/learning/runners/__init__.py +++ b/learning/runners/__init__.py @@ -32,4 +32,5 @@ from .on_policy_runner import OnPolicyRunner from .my_runner import MyRunner -from .old_policy_runner import OldPolicyRunner \ No newline at end of file +from .old_policy_runner import OldPolicyRunner +from .ipg_runner import IPGRunner \ No newline 
at end of file diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py new file mode 100644 index 00000000..ecc8a39b --- /dev/null +++ b/learning/runners/ipg_runner.py @@ -0,0 +1,274 @@ +import os +import torch +import torch.nn as nn +from tensordict import TensorDict + +from learning.utils import Logger + +from .BaseRunner import BaseRunner +from learning.modules import Actor, SmoothActor, Critic +from learning.storage import DictStorage, ReplayBuffer +from learning.algorithms import PPO_IPG + +logger = Logger() +storage_onpol = DictStorage() +storage_offpol = ReplayBuffer() + + +class IPGRunner(BaseRunner): + def __init__(self, env, train_cfg, device="cpu"): + super().__init__(env, train_cfg, device) + logger.initialize( + self.env.num_envs, + self.env.dt, + self.cfg["max_iterations"], + self.device, + ) + + def _set_up_alg(self): + alg_class = eval(self.cfg["algorithm_class_name"]) + if alg_class != PPO_IPG: + raise ValueError("IPGRunner only supports PPO_IPG") + + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic_v = Critic(num_critic_obs, **self.critic_cfg) + critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = alg_class( + actor, + critic_v, + critic_q, + target_critic_q, + device=self.device, + **self.alg_cfg, + ) + + def learn(self): + self.set_up_logger() + + rewards_dict = {} + + self.alg.switch_to_train() + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + tot_iter = self.it + self.num_learning_iterations + self.save() + + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + + # * start up both on- and off-policy storage + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + storage_onpol.initialize( + transition, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, + device=self.device, + ) + storage_offpol.initialize( + transition, + self.env.num_envs, + self.alg_cfg["storage_size"], + device=self.device, + ) + + # burn in observation normalization. 
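# (A brief gloss, assuming the usual running-mean/std setup: the burn-in call
#  below simply steps the environment with the current policy under
#  torch.no_grad, storing nothing, so that the obs_rms running statistics of
#  the observation normalizers see on-distribution data before the first
#  gradient update.)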
+ if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() + + logger.tic("runtime") + for self.it in range(self.it + 1, tot_iter + 1): + logger.tic("iteration") + logger.tic("collection") + # * Rollout + with torch.inference_mode(): + for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + # add transition to both storages + storage_onpol.add_transitions(transition) + storage_offpol.add_transitions(transition) + + logger.log_rewards(rewards_dict) + logger.log_rewards({"total_rewards": total_rewards}) + logger.finish_step(dones) + logger.toc("collection") + + logger.tic("learning") + self.alg.update(storage_onpol.data, storage_offpol.get_data()) + storage_onpol.clear() # only clear on-policy storage + logger.toc("learning") + logger.log_all_categories() + + logger.finish_iteration() + logger.toc("iteration") + logger.toc("runtime") + logger.print_to_terminal() + + if self.it % self.save_interval == 0: + self.save() + self.save() + + @torch.no_grad + def burn_in_normalization(self, n_iterations=200): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic_v.evaluate(critic_obs) + q_input = torch.cat((critic_obs, actions), dim=-1) + self.alg.critic_q.evaluate(q_input) + self.alg.target_critic_q.evaluate(q_input) + # self.env.reset() + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def set_up_logger(self): + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) + logger.register_rewards( + list(self.critic_cfg["reward"]["termination_weight"].keys()) + ) + logger.register_rewards(["total_rewards"]) + logger.register_category( + "algorithm", + self.alg, + [ + "mean_value_loss", + "mean_surrogate_loss", + "learning_rate", + # IPG specific + "mean_q_loss", + "mean_offpol_loss", + ], + ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + + 
logger.attach_torch_obj_to_wandb( + (self.alg.actor, self.alg.critic_v, self.alg.critic_q) + ) + + def save(self): + os.makedirs(self.log_dir, exist_ok=True) + path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_v_state_dict": self.alg.critic_v.state_dict(), + "critic_q_state_dict": self.alg.critic_q.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), + "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), + "iter": self.it, + }, + path, + ) + + def load(self, path, load_optimizer=True, load_actor_std=True): + loaded_dict = torch.load(path) + if load_actor_std: + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + else: + std_init = self.alg.actor.std.detach().clone() + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.actor.std = nn.Parameter(std_init) + self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) + self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_v_optimizer.load_state_dict( + loaded_dict["critic_v_opt_state_dict"] + ) + self.alg.critic_q_optimizer.load_state_dict( + loaded_dict["critic_q_opt_state_dict"] + ) + self.it = loaded_dict["iter"] + + def switch_to_eval(self): + self.alg.actor.eval() + self.alg.critic_v.eval() + self.alg.critic_q.eval() + + def get_inference_actions(self): + obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"]) + return self.alg.actor.act_inference(obs) + + def export(self, path): + self.alg.actor.export(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index dded2739..e5f000f8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -41,7 +41,7 @@ def learn(self, states_to_log_dict=None): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), @@ -55,30 +55,13 @@ def learn(self, states_to_log_dict=None): ) # burn in observation normalization. 
- with torch.inference_mode(): - for _ in range(200): - actions = self.alg.act(actor_obs, critic_obs) - self.set_actions( - self.actor_cfg["actions"], - actions, - self.actor_cfg["disable_actions"], - ) - self.env.step() - actor_obs = self.get_noisy_obs( - self.actor_cfg["obs"], self.actor_cfg["noise"] - ) + if self.actor_cfg["normalize_obs"] or self.critic_cfg["normalize_obs"]: + self.burn_in_normalization() logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - - # * Simulate environment and log states - if states_to_log_dict is not None: - it_idx = self.it - 1 - if it_idx % 10 == 0: - self.sim_and_log_states(states_to_log_dict, it_idx) - # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -87,7 +70,7 @@ def learn(self, states_to_log_dict=None): if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: self.alg.actor.sample_weights(batch_size=self.env.num_envs) - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -146,6 +129,22 @@ def learn(self, states_to_log_dict=None): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=200): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + # critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + # TODO: check this, trains better without evaluating critic + # critic_obs = self.get_obs(self.critic_cfg["obs"]) + # self.alg.critic.evaluate(critic_obs) + # self.env.reset() + def update_rewards(self, rewards_dict, terminated): rewards_dict.update( self.get_rewards( @@ -171,9 +170,7 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category( - "actor", self.alg.actor, ["action_std", "entropy", "get_std"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -212,42 +209,3 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) - - def sim_and_log_states(self, states_to_log_dict, it_idx): - # Simulate environment for as many steps as expected in the dict. - # Log states to the dict, as well as whether the env terminated. 
- steps = states_to_log_dict["terminated"].shape[2] - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - - with torch.inference_mode(): - for i in range(steps): - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) - - actions = self.alg.act(actor_obs, critic_obs) - self.set_actions( - self.policy_cfg["actions"], - actions, - self.policy_cfg["disable_actions"], - ) - - self.env.step() - - actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] - ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) - - # Log states (just for the first env) - terminated = self.get_terminated()[0] - for state in states_to_log_dict: - if state == "terminated": - states_to_log_dict[state][0, it_idx, i, :] = terminated - else: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] diff --git a/learning/storage/__init__.py b/learning/storage/__init__.py index f10df85f..47f2593b 100644 --- a/learning/storage/__init__.py +++ b/learning/storage/__init__.py @@ -3,4 +3,5 @@ from .rollout_storage import RolloutStorage from .SE_storage import SERolloutStorage -from .dict_storage import DictStorage \ No newline at end of file +from .dict_storage import DictStorage +from .replay_buffer import ReplayBuffer \ No newline at end of file diff --git a/learning/storage/replay_buffer.py b/learning/storage/replay_buffer.py new file mode 100644 index 00000000..350a2c25 --- /dev/null +++ b/learning/storage/replay_buffer.py @@ -0,0 +1,54 @@ +import torch +from tensordict import TensorDict + + +class ReplayBuffer: + def __init__(self): + self.initialized = False + + def initialize( + self, + dummy_dict, + num_envs=2**12, + max_storage=2**17, + device="cpu", + ): + self.device = device + self.num_envs = num_envs + max_length = max_storage // num_envs + self.max_length = max_length + self.data = TensorDict({}, batch_size=(max_length, num_envs), device=device) + self.fill_count = 0 + self.add_index = 0 + + for key in dummy_dict.keys(): + if dummy_dict[key].dim() == 1: # if scalar + self.data[key] = torch.zeros( + (max_length, num_envs), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + else: + self.data[key] = torch.zeros( + (max_length, num_envs, dummy_dict[key].shape[1]), + dtype=dummy_dict[key].dtype, + device=self.device, + ) + + @torch.inference_mode + def add_transitions(self, transition: TensorDict): + if self.fill_count >= self.max_length and self.add_index >= self.max_length: + self.add_index = 0 + self.data[self.add_index] = transition + self.fill_count += 1 + self.add_index += 1 + + def get_data(self): + return self.data[: min(self.fill_count, self.max_length), :] + + def clear(self): + self.fill_count = 0 + self.add_index = 0 + with torch.inference_mode(): + for tensor in self.data: + tensor.zero_() diff --git a/learning/utils/__init__.py b/learning/utils/__init__.py index 15c674d6..ec798653 100644 --- a/learning/utils/__init__.py +++ b/learning/utils/__init__.py @@ -1,8 +1,5 @@ -from .utils import ( - remove_zero_weighted_rewards, - set_discount_from_horizon, -) +from .utils import * from .dict_utils import * from .logger import Logger from .PBRS.PotentialBasedRewardShaping import PotentialBasedRewardShaping \ No newline at end of file diff --git a/learning/utils/utils.py b/learning/utils/utils.py index 7a54c65a..d4eac980 100644 
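# A minimal usage sketch of the ReplayBuffer added above (sizes are made up for
# illustration). It behaves as a ring buffer over the time dimension: once
# max_length = max_storage // num_envs rows are filled, add_transitions() wraps
# around and overwrites the oldest row, and get_data() never returns more than
# max_length rows.
import torch
from tensordict import TensorDict
from learning.storage import ReplayBuffer

num_envs, max_storage = 2, 8  # -> max_length = 4
dummy = TensorDict(
    {
        "rewards": torch.zeros(num_envs),
        "dones": torch.zeros(num_envs, dtype=torch.bool),
    },
    batch_size=num_envs,
)
buf = ReplayBuffer()
buf.initialize(dummy, num_envs=num_envs, max_storage=max_storage)
for _ in range(6):  # 6 > max_length, so rows 0 and 1 get overwritten
    buf.add_transitions(dummy)
assert buf.get_data().shape == (4, num_envs)  # capped at max_length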
--- a/learning/utils/utils.py +++ b/learning/utils/utils.py @@ -17,3 +17,9 @@ def set_discount_from_horizon(dt, horizon): discount_factor = 1 - 1 / discrete_time_horizon return discount_factor + + +def polyak_update(online, target, polyak_factor): + for op, tp in zip(online.parameters(), target.parameters()): + tp.data.copy_((1.0 - polyak_factor) * op.data + polyak_factor * tp.data) + return target From d0106332e495ce1e503165d6301f01a3343328bb Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 8 Jul 2024 15:45:00 -0400 Subject: [PATCH 22/42] IPG-smooth learns decently (adapt LR) --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 5 ++--- learning/algorithms/ppo2.py | 3 +-- learning/algorithms/ppo_ipg.py | 13 +++++++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index eba8c34a..e4ceada5 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,7 +73,7 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True exploration_sample_freq = 16 normalize_obs = True @@ -151,7 +151,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_learning_epochs = 6 # mini batch size = num_envs*nsteps/nminibatches num_mini_batches = 4 - mini_batch_size = 2**15 # new desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 max_grad_norm = 1.0 @@ -170,6 +169,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 700 # number of policy updates algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index e5912e3f..7f4bb041 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -25,10 +25,9 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - loss_fn="MSE", - device="cpu", lr_range=[1e-4, 1e-2], lr_ratio=1.3, + device="cpu", **kwargs, ): self.device = device diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index ba92b575..cd73215d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -28,6 +28,8 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, polyak=0.995, use_cv=False, inter_nu=0.2, @@ -40,6 +42,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -208,11 +212,16 @@ def update_actor(self, data_onpol, data_offpol): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! 
check this From 35a65c2807b3e71d9d0c03c9c017f82b46f27fc6 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 16:24:44 -0400 Subject: [PATCH 23/42] refactor for changes on dev --- gym/envs/base/legged_robot_config.py | 27 +++++--- gym/envs/mini_cheetah/mini_cheetah_config.py | 18 +---- .../mini_cheetah/mini_cheetah_ref_config.py | 20 +----- learning/algorithms/ppo2.py | 64 +++++++----------- learning/algorithms/ppo_ipg.py | 65 +++++++++---------- learning/modules/critic.py | 15 +++-- learning/runners/ipg_runner.py | 14 ++-- learning/runners/on_policy_runner.py | 60 +++++++++++++++-- learning/utils/dict_utils.py | 50 +++++++------- scripts/export_policy.py | 1 + 10 files changed, 177 insertions(+), 157 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 1f6462ce..2e73d924 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -289,20 +289,27 @@ class termination_weight: termination = 0.01 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 + lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 2bd7958d..6de32218 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -192,23 +192,7 @@ class termination_weight: termination = 0.01 class algorithm(LeggedRobotRunnerCfg.algorithm): - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - discount_horizon = 1.0 # [s] - # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.02 - max_grad_norm = 1.0 - # * Learning rate - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive or fixed - lr_range = [2e-4, 1e-2] - lr_ratio = 1.3 + pass class runner(LeggedRobotRunnerCfg.runner): run_name = "" diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index e4ceada5..30e614b0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -141,30 +141,14 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - # training params - gamma = 0.99 - lam = 0.95 - value_loss_coef = 1.0 # deprecate for PPO2 - use_clipped_value_loss = True # deprecate for PPO2 - clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 6 - # mini batch size = num_envs*nsteps/nminibatches - num_mini_batches = 4 - - desired_kl = 0.02 # 0.02 for smooth-expl, else 0.01 - max_grad_norm = 1.0 - learning_rate = 0.002 # initial value - schedule = "adaptive" # can be adaptive or fixed - lr_range = [3e-4, 1e-2] - lr_ratio = 1.3 # 1.3 for smooth-expl, else 1.5 + desired_kl = 0.02 # 
0.02 for smooth-exploration, else 0.01 # IPG - storage_size = 4 * 32 * 4096 # old_policies*nsteps*num_envs polyak = 0.995 use_cv = False inter_nu = 0.2 beta = "off_policy" + storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 7f4bb041..6c64e334 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -5,8 +5,8 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, ) -from learning.modules import SmoothActor class PPO2: @@ -14,8 +14,8 @@ def __init__( self, actor, critic, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -25,9 +25,9 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, + device="cpu", lr_range=[1e-4, 1e-2], lr_ratio=1.3, - device="cpu", **kwargs, ): self.device = device @@ -46,18 +46,14 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss - def test_mode(self): - self.actor.test() - self.critic.test() - def switch_to_train(self): self.actor.train() self.critic.train() @@ -65,30 +61,27 @@ def switch_to_train(self): def act(self, obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic ) - + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) + data["advantages"] = normalize(data["advantages"]) self.update_actor(data) def update_critic(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic.evaluate(batch["critic_obs"]) - value_loss = self.critic.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) @@ -98,8 +91,6 @@ def update_critic(self, data): self.mean_value_loss /= counter def update_actor(self, data): - # already done before - # compute_generalized_advantages(data, self.gamma, self.lam, self.critic) self.mean_surrogate_loss = 0 counter = 0 @@ -110,16 +101,12 @@ def update_actor(self, data): data["actions"] ).detach() - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + 
max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - # * Re-sample noise for smooth actor - if isinstance(self.actor, SmoothActor): - self.actor.sample_weights(batch_size) - - # ! refactor how this is done self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean @@ -171,10 +158,7 @@ def update_actor(self, data): # * Gradient step self.optimizer.zero_grad() loss.backward() - nn.utils.clip_grad_norm_( - list(self.actor.parameters()) + list(self.critic.parameters()), - self.max_grad_norm, - ) + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += surrogate_loss.item() counter += 1 diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index cd73215d..000badec 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -5,9 +5,9 @@ from learning.utils import ( create_uniform_generator, compute_generalized_advantages, + normalize, polyak_update, ) -from learning.modules import SmoothActor class PPO_IPG: @@ -17,8 +17,8 @@ def __init__( critic_v, critic_q, target_critic_q, - num_learning_epochs=1, - num_mini_batches=1, + batch_size=2**15, + max_gradient_steps=10, clip_param=0.2, gamma=0.998, lam=0.95, @@ -28,13 +28,13 @@ def __init__( use_clipped_value_loss=True, schedule="fixed", desired_kl=0.01, - lr_range=[1e-4, 1e-2], - lr_ratio=1.3, polyak=0.995, use_cv=False, inter_nu=0.2, beta="off_policy", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -63,8 +63,8 @@ def __init__( # * PPO parameters self.clip_param = clip_param - self.num_learning_epochs = num_learning_epochs - self.num_mini_batches = num_mini_batches + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps self.entropy_coef = entropy_coef self.gamma = gamma self.lam = lam @@ -87,14 +87,16 @@ def act(self, obs): def update(self, data_onpol, data_offpol, last_obs=None): # On-policy GAE - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic_v.evaluate(last_obs).detach() - compute_generalized_advantages( - data_onpol, self.gamma, self.lam, self.critic_v, last_values + values = self.critic_v.evaluate(data_onpol["critic_obs"]) + # Handle single env case + if values.dim() == 1: + values = values.unsqueeze(-1) + data_onpol["values"] = values + data_onpol["advantages"] = compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v ) + data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] + data_onpol["advantages"] = normalize(data_onpol["advantages"]) self.update_critic_q(data_offpol) @@ -116,10 +118,11 @@ def update_critic_q(self, data): self.mean_q_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: with torch.no_grad(): # TODO: check that should be inference @@ -133,8 +136,7 @@ def update_critic_q(self, data): + self.gamma * batch["dones"].logical_not() * q_next ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) - q_value = self.critic_q.evaluate(q_input) - q_loss = self.critic_q.loss_fn(q_value, q_target) + q_loss = self.critic_q.loss_fn(q_input, q_target) 
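# The q_target computed above is the standard one-step bootstrap, using the
# target Q-network and the next action from the current actor (act_inference):
#     q_target = r_t + gamma * (1 - done_t) * Q_target(s_{t+1}, a_{t+1})
# e.g. with assumed values r_t = 1.0, gamma = 0.99, q_next = 10.0, done_t = False
# this gives q_target = 1.0 + 0.99 * 10.0 = 10.9; a terminal step keeps only r_t.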
self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) @@ -152,13 +154,13 @@ def update_critic_v(self, data): self.mean_value_loss = 0 counter = 0 - n, m = data.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches - generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) for batch in generator: - value_batch = self.critic_v.evaluate(batch["critic_obs"]) - value_loss = self.critic_v.loss_fn(value_batch, batch["returns"]) + value_loss = self.critic_v.loss_fn(batch["critic_obs"], batch["returns"]) self.critic_v_optimizer.zero_grad() value_loss.backward() nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) @@ -180,17 +182,12 @@ def update_actor(self, data_onpol, data_offpol): ).detach() # Generate off-policy batches and use all on-policy data - n, m = data_offpol.shape - total_data = n * m - batch_size = total_data // self.num_mini_batches generator_offpol = create_uniform_generator( - data_offpol, batch_size, self.num_learning_epochs + data_offpol, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, ) for batch_offpol in generator_offpol: - # * Re-sample noise for smooth actor - if isinstance(self.actor, SmoothActor): - self.actor.sample_weights(batch_size) - self.actor.update_distribution(data_onpol["actor_obs"]) actions_log_prob_onpol = self.actor.get_actions_log_prob( data_onpol["actions"] diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 3ee930dc..96c53438 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -15,16 +15,19 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) - def evaluate(self, critic_observations): + def forward(self, x): if self._normalize_obs: with torch.no_grad(): - critic_observations = self.obs_rms(critic_observations) - return self.NN(critic_observations).squeeze() + x = self.obs_rms(x) + return self.NN(x).squeeze() + + def evaluate(self, critic_observations): + return self.forward(critic_observations) - def loss_fn(self, input, target): - return nn.functional.mse_loss(input, target, reduction="mean") + def loss_fn(self, obs, target): + return nn.functional.mse_loss(self.forward(obs), target, reduction="mean") diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index ecc8a39b..bb8bb044 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -6,9 +6,9 @@ from learning.utils import Logger from .BaseRunner import BaseRunner +from learning.algorithms import * # noqa: F403 from learning.modules import Actor, SmoothActor, Critic from learning.storage import DictStorage, ReplayBuffer -from learning.algorithms import PPO_IPG logger = Logger() storage_onpol = DictStorage() @@ -26,10 +26,11 @@ def __init__(self, env, train_cfg, device="cpu"): ) def _set_up_alg(self): - alg_class = eval(self.cfg["algorithm_class_name"]) - if alg_class != PPO_IPG: + alg_class_name = self.cfg["algorithm_class_name"] + if alg_class_name != "PPO_IPG": raise ValueError("IPGRunner only supports PPO_IPG") + alg_class = eval(alg_class_name) num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = 
self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) @@ -170,7 +171,7 @@ def learn(self): self.save() @torch.no_grad - def burn_in_normalization(self, n_iterations=200): + def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) for _ in range(n_iterations): @@ -181,11 +182,12 @@ def burn_in_normalization(self, n_iterations=200): self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - self.alg.critic_v.evaluate(critic_obs) + # TODO: Check this, seems to perform better without critic eval + # self.alg.critic_v.evaluate(critic_obs) q_input = torch.cat((critic_obs, actions), dim=-1) self.alg.critic_q.evaluate(q_input) self.alg.target_critic_q.evaluate(q_input) - # self.env.reset() + self.env.reset() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e5f000f8..c98d088c 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -41,8 +41,10 @@ def learn(self, states_to_log_dict=None): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -62,6 +64,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -102,6 +111,8 @@ def learn(self, states_to_log_dict=None): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, @@ -130,7 +141,7 @@ def learn(self, states_to_log_dict=None): self.save() @torch.no_grad - def burn_in_normalization(self, n_iterations=200): + def burn_in_normalization(self, n_iterations=100): actor_obs = self.get_obs(self.actor_cfg["obs"]) # critic_obs = self.get_obs(self.critic_cfg["obs"]) for _ in range(n_iterations): @@ -140,10 +151,10 @@ def burn_in_normalization(self, n_iterations=200): actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) - # TODO: check this, trains better without evaluating critic + # TODO: Check this, seems to perform better without critic eval # critic_obs = self.get_obs(self.critic_cfg["obs"]) # self.alg.critic.evaluate(critic_obs) - # self.env.reset() + self.env.reset() def update_rewards(self, rewards_dict, terminated): rewards_dict.update( @@ -170,7 +181,9 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -209,3 +222,42 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. 
+ # Log states to the dict, as well as whether the env terminated. + steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..99534c22 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -9,30 +9,29 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): else: last_values = critic.evaluate(data["critic_obs"][-1]) - data.update({"returns": torch.zeros_like(data["rewards"])}) - data["returns"][-1] = last_values * ~data["dones"][-1] + returns = torch.zeros_like(data["rewards"]) + returns[-1] = last_values * ~data["dones"][-1] for k in reversed(range(data["rewards"].shape[0] - 1)): not_done = ~data["dones"][k] - data["returns"][k] = ( - data["rewards"][k] + gamma * data["returns"][k + 1] * not_done - ) - data["returns"] = (data["returns"] - data["returns"].mean()) / ( - data["returns"].std() + 1e-8 - ) - return + returns[k] = data["rewards"][k] + gamma * returns[k + 1] * not_done + + return normalize(returns) @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) - data.update({"advantages": torch.zeros_like(data["values"])}) +@torch.no_grad +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -47,23 +46,30 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["returns"] = data["advantages"] + data["values"] - - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + return advantages # todo change num_epochs to num_batches @torch.no_grad -def create_uniform_generator(data, batch_size, num_epochs, keys=None): +def create_uniform_generator( + data, 
batch_size, num_epochs=1, max_gradient_steps=None, keys=None +): n, m = data.shape total_data = n * m + + if batch_size > total_data: + Warning("Batch size is larger than total data, using available data only.") + batch_size = total_data + num_batches_per_epoch = total_data // batch_size + if max_gradient_steps: + if max_gradient_steps < num_batches_per_epoch: + num_batches_per_epoch = max_gradient_steps + num_epochs = max_gradient_steps // num_batches_per_epoch + num_epochs = max(num_epochs, 1) + for epoch in range(num_epochs): indices = torch.randperm(total_data, device=data.device) for i in range(num_batches_per_epoch): diff --git a/scripts/export_policy.py b/scripts/export_policy.py index a5897359..d591ea14 100644 --- a/scripts/export_policy.py +++ b/scripts/export_policy.py @@ -22,6 +22,7 @@ def setup_and_export(args): LEGGED_GYM_ROOT_DIR, "logs", train_cfg.runner.experiment_name, + train_cfg.runner.load_run, "exported", ) runner.export(path) From 5ce0d6e4c2d44425ee28539d36e7458253b32c0c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 20:14:34 -0400 Subject: [PATCH 24/42] finetune using robot-software logs --- gym/envs/mini_cheetah/minimalist_cheetah.py | 71 +++++++++ learning/runners/finetune_runner.py | 73 +++++++++ scripts/finetune.py | 159 ++++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 gym/envs/mini_cheetah/minimalist_cheetah.py create mode 100644 learning/runners/finetune_runner.py create mode 100644 scripts/finetune.py diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py new file mode 100644 index 00000000..8da8b9d9 --- /dev/null +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -0,0 +1,71 @@ +import torch + + +class MinimalistCheetah: + """ + Helper class for computing mini cheetah rewards + """ + + def __init__(self, device="cpu", tracking_sigma=0.25): + self.device = device + self.tracking_sigma = tracking_sigma + + def set_states( + self, + base_height, + base_lin_vel, + base_ang_vel, + proj_gravity, + commands, + phase_obs, + grf, + ): + # Unsqueeze so first dim is batch_size + self.base_height = torch.tensor(base_height, device=self.device).unsqueeze(0) + self.base_lin_vel = torch.tensor(base_lin_vel, device=self.device).unsqueeze(0) + self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) + self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) + self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) + + phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) + self.phase_sin = torch.sin(phase_obs) + + def _switch(self): + c_vel = torch.linalg.norm(self.commands, dim=1) + return torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + ) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling. 
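# e.g. with assumed values cmd = (1.0, 0.0) m/s and base_lin_vel = (0.8, 0.0):
#   raw error = (0.2, 0.0) -> scaled by 1 / (1 + |cmd|) -> (0.1, 0.0)
#   sum of squares = 0.01 -> reward = exp(-0.01 / 0.25) ~= 0.96 with sigma = 0.25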
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + error = torch.sum(torch.square(error), dim=1) + return torch.exp(-error / self.tracking_sigma) + + def _reward_tracking_ang_vel(self): + """Tracking of angular velocity commands (yaw)""" + ang_vel_error = torch.square(self.commands[:, 2] - self.base_ang_vel[:, 2]) + return torch.exp(-ang_vel_error / self.tracking_sigma) + + def _reward_orientation(self): + """Penalize non-flat base orientation""" + error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma + return torch.sum(torch.exp(-error), dim=1) + + def _reward_swing_grf(self): + """Reward non-zero grf during swing (0 to pi)""" + in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + ph_off = torch.gt(self.phase_sin, 0) # phase <= pi + rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + + def _reward_stance_grf(self): + """Reward non-zero grf during stance (pi to 2pi)""" + in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + ph_off = torch.lt(self.phase_sin, 0) # phase >= pi + rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) + return torch.sum(rew.float(), dim=1) * (1 - self._switch()) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py new file mode 100644 index 00000000..deace319 --- /dev/null +++ b/learning/runners/finetune_runner.py @@ -0,0 +1,73 @@ +import torch +from learning.algorithms import * # noqa: F403 +from learning.modules import Actor, SmoothActor, Critic +from learning.utils import remove_zero_weighted_rewards + + +class FineTuneRunner: + def __init__(self, train_cfg, data_dict, device="cpu"): + self.parse_train_cfg(train_cfg) + self.data_dict = data_dict + self.device = device + + def _set_up_alg(self): + num_actor_obs = self.data_dict["actor_obs"].shape[-1] + num_actions = self.data_dict["actions"].shape[-1] + num_critic_obs = self.data_dict["critic_obs"].shape[-1] + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + critic = Critic(num_critic_obs, **self.critic_cfg) + alg_class = eval(self.cfg["algorithm_class_name"]) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + def parse_train_cfg(self, train_cfg): + self.cfg = train_cfg["runner"] + self.alg_cfg = train_cfg["algorithm"] + remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) + self.actor_cfg = train_cfg["actor"] + self.critic_cfg = train_cfg["critic"] + + def learn(self): + # Single update on data dict + self.alg.update(self.data_dict) + + def save(self, path): + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_state_dict": self.alg.critic.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), + "iter": 1, # only one iteration + }, + path, + ) + + def load(self, path, load_optimizer=True): + loaded_dict = torch.load(path) + self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) + + def export(self, path): + # Need to make a copy of actor + if self.actor_cfg["smooth_exploration"]: + 
actor_copy = SmoothActor( + self.alg.actor.num_obs, self.alg.actor.num_actions, **self.actor_cfg + ) + else: + actor_copy = Actor( + self.alg.actor.num_obs, self.alg.actor.num_actions, **self.actor_cfg + ) + state_dict = { + name: param.detach().clone() + for name, param in self.alg.actor.state_dict().items() + } + actor_copy.load_state_dict(state_dict) + actor_copy.export(path) diff --git a/scripts/finetune.py b/scripts/finetune.py new file mode 100644 index 00000000..63c55328 --- /dev/null +++ b/scripts/finetune.py @@ -0,0 +1,159 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils.helpers import class_to_dict + +from learning.algorithms import * # noqa: F403 +from learning.runners.finetune_runner import FineTuneRunner +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah + +from gym import LEGGED_GYM_ROOT_DIR + +import os +import torch +import scipy.io +from tensordict import TensorDict + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul08_13-02-15_PPO32_S16" +LOG_FILE = "PPO32_S16_expl.mat" +ITERATION = 500 + +# Data struct fields from Robot-Software logs +DATA_LIST = [ + "header", + "base_height", # shape (1, batch_size) + # the following are all shape (batch_size, n) + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "grf", + "dof_pos_target", + "exploration_noise", + "footer", +] + +# Finetune for these rewards instead of the ones in the cfg +REWARD_WEIGHTS = { + # "tracking_lin_vel": 4.0, + "tracking_ang_vel": 2.0, + "orientation": 1.0, + "swing_grf": 1.5, + "stance_grf": 1.5, +} + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_obs(obs_list, data_struct, batch_size): + obs_all = torch.empty(0).to(DEVICE) + for obs_name in obs_list: + data_idx = DATA_LIST.index(obs_name) + obs = torch.tensor(data_struct[data_idx]).to(DEVICE) + obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + obs_all = torch.cat((obs_all, obs), dim=-1) + + return obs_all.float() + + +def get_rewards(data_struct, batch_size): + rewards_all = torch.empty(0).to(DEVICE) + for i in range(batch_size): + mini_cheetah = MinimalistCheetah(device=DEVICE) + mini_cheetah.set_states( + base_height=data_struct[1][:, i], # shape (1, batch_size) + base_lin_vel=data_struct[2][i], + base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + ) + total_rewards = 0 + for name, weight in REWARD_WEIGHTS.items(): + reward = eval(f"mini_cheetah._reward_{name}()") + total_rewards += weight * reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + + return rewards_all.float() + + +def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): + path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + data = scipy.io.loadmat(path) + data_struct = data[name][0][0] + batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) + data_dict = TensorDict( + {}, + device=DEVICE, + batch_size=(batch_size - 1, 1), # -1 for next_obs + ) + + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) + + data_dict["actor_obs"] = actor_obs[:-1] + data_dict["next_actor_obs"] = actor_obs[1:] + data_dict["critic_obs"] = critic_obs[:-1] + data_dict["next_critic_obs"] = critic_obs[1:] + + actions_idx = DATA_LIST.index("dof_pos_target") + 
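# (Gloss on the indexing convention, based on DATA_LIST above: each per-step
#  field of the Robot-Software .mat struct is stored as (batch_size, n) and is
#  looked up by position via DATA_LIST.index(name); get_obs() reshapes each one
#  to (batch_size, 1, n), where the middle dimension of 1 matches the single
#  "environment" expected by the TensorDict with batch_size=(batch_size - 1, 1).)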
actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) + actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + data_dict["actions"] = actions[:-1] + + rewards = get_rewards(data_struct, batch_size) + rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) + data_dict["rewards"] = rewards[:-1] + + # Assume no termination + data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + + return data_dict + + +def setup(): + args = get_args() + + _, train_cfg = task_registry.create_cfgs(args) + train_cfg = class_to_dict(train_cfg) + data_dict = get_data_dict(train_cfg) + + runner = FineTuneRunner(train_cfg, data_dict, device=DEVICE) + runner._set_up_alg() + + return runner + + +def finetune(runner): + load_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION) + ".pt") + print("Loading model from: ", load_path) + runner.load(load_path) + + # Perform a single update + runner.learn() + + # Compare old and new actions + scaling = torch.tensor([0.2, 0.3, 0.3], device=DEVICE).repeat(4) + actions_old = runner.data_dict["actions"] + actions_new = scaling * runner.alg.actor.act_inference( + runner.data_dict["actor_obs"] + ) + diff = actions_new - actions_old + print("Mean action diff per actuator: ", diff.mean(dim=0)) + + save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") + runner.save(save_path) + + # export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + # runner.export(export_path) + + +if __name__ == "__main__": + runner = setup() + finetune(runner) From cb75936cb925febed68959bd285ed8ce29970680 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 10 Jul 2024 20:16:13 -0400 Subject: [PATCH 25/42] handle single env dimension at critic --- learning/algorithms/ppo_ipg.py | 8 ++------ learning/modules/critic.py | 2 +- learning/modules/utils/neural_net.py | 8 +++++--- learning/modules/utils/normalize.py | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 000badec..e377579f 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -85,13 +85,9 @@ def switch_to_train(self): def act(self, obs): return self.actor.act(obs).detach() - def update(self, data_onpol, data_offpol, last_obs=None): + def update(self, data_onpol, data_offpol): # On-policy GAE - values = self.critic_v.evaluate(data_onpol["critic_obs"]) - # Handle single env case - if values.dim() == 1: - values = values.unsqueeze(-1) - data_onpol["values"] = values + data_onpol["values"] = self.critic_v.evaluate(data_onpol["critic_obs"]) data_onpol["advantages"] = compute_generalized_advantages( data_onpol, self.gamma, self.lam, self.critic_v ) diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 96c53438..f2e8678f 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -24,7 +24,7 @@ def forward(self, x): if self._normalize_obs: with torch.no_grad(): x = self.obs_rms(x) - return self.NN(x).squeeze() + return self.NN(x).squeeze(-1) def evaluate(self, critic_observations): return self.forward(critic_observations) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 6e6f41d6..d452da57 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py 
@@ -1,6 +1,7 @@ import torch import os -import copy + +# import copy import numpy as np @@ -78,7 +79,8 @@ def export_network(network, network_name, path, num_inputs, latent=True): os.makedirs(path, exist_ok=True) path_TS = os.path.join(path, network_name + ".pt") # TorchScript path path_onnx = os.path.join(path, network_name + ".onnx") # ONNX path - model = copy.deepcopy(network).to("cpu") + # model = copy.deepcopy(network).to("cpu") + model = network.to("cpu") # no deepcopy # To trace model, must be evaluated once with arbitrary input model.eval() dummy_input = torch.rand((num_inputs)) @@ -98,5 +100,5 @@ def export_network(network, network_name, path, num_inputs, latent=True): # Save actor std of shape (num_actions, latent_dim) # It is important that the shape is the same as the exploration matrix path_std = os.path.join(path, network_name + "_std.txt") - std_transposed = model.get_std.numpy().T + std_transposed = model.get_std.detach().numpy().T np.savetxt(path_std, std_transposed) diff --git a/learning/modules/utils/normalize.py b/learning/modules/utils/normalize.py index 246bafa4..0a60dcbf 100644 --- a/learning/modules/utils/normalize.py +++ b/learning/modules/utils/normalize.py @@ -41,7 +41,7 @@ def _update_mean_var_from_moments( def forward(self, input): if self.training: mean = input.mean(tuple(range(input.dim() - 1))) - var = input.var(tuple(range(input.dim() - 1))) + var = torch.nan_to_num(input.var(tuple(range(input.dim() - 1)))) ( self.running_mean, self.running_var, From f5829f079c66c66bb3c4d8f362e184c16ed06311 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 11 Jul 2024 14:25:42 -0400 Subject: [PATCH 26/42] update finetuning: exploration scale, grf rewards, plotting --- .../mini_cheetah/mini_cheetah_ref_config.py | 9 ++- gym/envs/mini_cheetah/minimalist_cheetah.py | 14 ++-- learning/modules/smooth_actor.py | 22 +++--- learning/runners/finetune_runner.py | 10 ++- scripts/finetune.py | 79 +++++++++++++------ 5 files changed, 89 insertions(+), 45 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 30e614b0..09970e66 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "IPGRunner" + runner_class_name = "OnPolicyRunner" class actor: hidden_dims = [256, 256, 128] @@ -150,9 +150,14 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): beta = "off_policy" storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + # Finetuning + learning_rate = 3e-4 + max_gradient_steps = 10 + batch_size = 4096 + class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 700 # number of policy updates - algorithm_class_name = "PPO_IPG" + algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 8da8b9d9..658bb9a3 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -29,7 +29,7 @@ def set_states( self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) - self.phase_sin = torch.sin(phase_obs) + self.phase_sin = phase_obs[:, 0].unsqueeze(0) def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) @@ -56,16 +56,16 @@ def 
_reward_orientation(self): error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma return torch.sum(torch.exp(-error), dim=1) - def _reward_swing_grf(self): + def _reward_swing_grf(self, contact_thresh=0.5): """Reward non-zero grf during swing (0 to pi)""" - in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) # * (1 - self._switch()) - def _reward_stance_grf(self): + def _reward_stance_grf(self, contact_thresh=0.5): """Reward non-zero grf during stance (pi to 2pi)""" - in_contact = torch.gt(torch.norm(self.grf, dim=-1), 50.0) + in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) # * (1 - self._switch()) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 5cb1da89..72c4ace1 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -12,8 +12,9 @@ # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 class SmoothActor(Actor): weights_dist: Normal - _latent_sde: torch.Tensor + latent_sde: torch.Tensor exploration_matrices: torch.Tensor + exploration_scale: float def __init__( self, @@ -23,6 +24,7 @@ def __init__( learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, + exploration_scale: float = 1.0, **kwargs, ): super().__init__(*args, **kwargs) @@ -31,6 +33,7 @@ def __init__( self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init + self.exploration_scale = exploration_scale # for finetuning # Create latent NN and last layer self.latent_net = create_MLP( @@ -88,16 +91,17 @@ def update_distribution(self, observations): with torch.no_grad(): observations = self.obs_rms(observations) # Get latent features and compute distribution - self._latent_sde = self.latent_net(observations) + self.latent_sde = self.latent_net(observations) + std_scaled = self.get_std * self.exploration_scale if not self.learn_features: - self._latent_sde = self._latent_sde.detach() - if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std**2) - elif self._latent_sde.dim() == 3: - variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + self.latent_sde = self.latent_sde.detach() + if self.latent_sde.dim() == 2: + variance = torch.mm(self.latent_sde**2, std_scaled**2) + elif self.latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self.latent_sde**2, std_scaled**2) else: raise ValueError("Invalid latent_sde dimension") - mean_actions = self.mean_actions_net(self._latent_sde) + mean_actions = self.mean_actions_net(self.latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) def act(self, observations): @@ -118,7 +122,7 @@ def act_inference(self, observations): return mean_actions def get_noise(self): - latent_sde = self._latent_sde + latent_sde = self.latent_sde if not self.learn_features: latent_sde = latent_sde.detach() # Use batch matrix multiplication for efficient computation diff --git 
a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index deace319..4cfdf895 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -5,9 +5,10 @@ class FineTuneRunner: - def __init__(self, train_cfg, data_dict, device="cpu"): + def __init__(self, train_cfg, data_dict, exploration_scale=1.0, device="cpu"): self.parse_train_cfg(train_cfg) self.data_dict = data_dict + self.exploration_scale = exploration_scale self.device = device def _set_up_alg(self): @@ -15,7 +16,12 @@ def _set_up_alg(self): num_actions = self.data_dict["actions"].shape[-1] num_critic_obs = self.data_dict["critic_obs"].shape[-1] if self.actor_cfg["smooth_exploration"]: - actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + actor = SmoothActor( + num_obs=num_actor_obs, + num_actions=num_actions, + exploration_scale=self.exploration_scale, + **self.actor_cfg, + ) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) diff --git a/scripts/finetune.py b/scripts/finetune.py index 63c55328..7fc3f270 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,17 +12,22 @@ import torch import scipy.io from tensordict import TensorDict +import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul08_13-02-15_PPO32_S16" -LOG_FILE = "PPO32_S16_expl.mat" +LOG_FILE = "PPO32_S16_Jul08_expl08.mat" + +TRAJ_LENGTH = 100 # split data into chunks of this length +EXPLORATION_SCALE = 0.8 # scale std in SmoothActor ITERATION = 500 +PLOT = True # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # shape (1, batch_size) - # the following are all shape (batch_size, n) + "base_height", # shape (1, data_length) + # the following are all shape (data_length, n) "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -39,8 +44,8 @@ # Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { # "tracking_lin_vel": 4.0, - "tracking_ang_vel": 2.0, - "orientation": 1.0, + # "tracking_ang_vel": 2.0, + # "orientation": 1.0, "swing_grf": 1.5, "stance_grf": 1.5, } @@ -48,23 +53,27 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -def get_obs(obs_list, data_struct, batch_size): +def get_obs(obs_list, data_struct, batches): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + if obs.shape[0] == 1: + obs = obs.T # base_height has shape (1, data_length) + obs = obs[: TRAJ_LENGTH * batches, :] + obs = obs.reshape((TRAJ_LENGTH, batches, -1)) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, batch_size): +def get_rewards(data_struct, batches): + rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(batch_size): + for i in range(TRAJ_LENGTH * batches): mini_cheetah = MinimalistCheetah(device=DEVICE) mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, batch_size) + base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -75,8 +84,14 @@ def get_rewards(data_struct, batch_size): total_rewards = 0 for name, weight in REWARD_WEIGHTS.items(): reward = eval(f"mini_cheetah._reward_{name}()") + 
rewards_dict[name].append(reward.item()) total_rewards += weight * reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) + + if PLOT: + for name, rewards in rewards_dict.items(): + plt.plot(rewards, label=name) return rewards_all.float() @@ -85,15 +100,16 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) data = scipy.io.loadmat(path) data_struct = data[name][0][0] - batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) + + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + num_trajs = data_length // TRAJ_LENGTH data_dict = TensorDict( {}, device=DEVICE, - batch_size=(batch_size - 1, 1), # -1 for next_obs + batch_size=(TRAJ_LENGTH - 1, num_trajs), # -1 for next_obs ) - - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, num_trajs) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, num_trajs) data_dict["actor_obs"] = actor_obs[:-1] data_dict["next_actor_obs"] = actor_obs[1:] @@ -102,17 +118,23 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): actions_idx = DATA_LIST.index("dof_pos_target") actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) - actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + actions = actions[: TRAJ_LENGTH * num_trajs, :] + actions = actions.reshape((TRAJ_LENGTH, num_trajs, -1)) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, batch_size) - rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) + rewards = get_rewards(data_struct, num_trajs) data_dict["rewards"] = rewards[:-1] # Assume no termination - data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["timed_out"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) + data_dict["terminal"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) + data_dict["dones"] = torch.zeros( + TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool + ) return data_dict @@ -124,7 +146,9 @@ def setup(): train_cfg = class_to_dict(train_cfg) data_dict = get_data_dict(train_cfg) - runner = FineTuneRunner(train_cfg, data_dict, device=DEVICE) + runner = FineTuneRunner( + train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, device=DEVICE + ) runner._set_up_alg() return runner @@ -145,13 +169,18 @@ def finetune(runner): runner.data_dict["actor_obs"] ) diff = actions_new - actions_old - print("Mean action diff per actuator: ", diff.mean(dim=0)) + print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") runner.save(save_path) - # export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") - # runner.export(export_path) + export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + runner.export(export_path) + + if PLOT: + plt.title("Rewards") + plt.legend() + plt.show() if __name__ == "__main__": From f3a4cc426c7749a664e0da50c6e0b9ef9a38fc23 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Thu, 11 Jul 2024 14:56:04 -0400 Subject: [PATCH 27/42] added min base 
height reward --- gym/envs/mini_cheetah/minimalist_cheetah.py | 16 ++++++++++++++-- scripts/finetune.py | 4 +++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 658bb9a3..c87db2e5 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -31,12 +31,24 @@ def set_states( phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - def _switch(self): + def _sqrdexp(self, x, scale=1.0): + """shorthand helper for squared exponential""" + return torch.exp(-torch.square(x / scale) / self.tracking_sigma) + + def _switch(self, scale=0.5): + # TODO: Check scale, RS commands are scaled differently than QGym c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - scale)) / scale ) + def _reward_min_base_height(self, target_height=0.3, scale=0.3): + """Squared exponential saturating at base_height target""" + # TODO: Check scale + error = (self.base_height - target_height) / scale + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" # just use lin_vel? diff --git a/scripts/finetune.py b/scripts/finetune.py index 7fc3f270..ebeaa6dc 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -43,6 +43,7 @@ # Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { + "min_base_height": 1.5, # "tracking_lin_vel": 4.0, # "tracking_ang_vel": 2.0, # "orientation": 1.0, @@ -91,7 +92,8 @@ def get_rewards(data_struct, batches): if PLOT: for name, rewards in rewards_dict.items(): - plt.plot(rewards, label=name) + n_steps = 200 + plt.plot(rewards[:n_steps], label=name) return rewards_all.float() From fdc028fbcc0d05a00f144873c68fd40a7d9a9ffb Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 13 Jul 2024 19:20:45 -0400 Subject: [PATCH 28/42] reset 10% standstill/pure lin/pure ang vel; set rewards: grf=3, standstill=2 --- gym/envs/mini_cheetah/mini_cheetah_ref.py | 20 ++++++++++++---- .../mini_cheetah/mini_cheetah_ref_config.py | 14 +++++------ learning/runners/on_policy_runner.py | 4 +--- scripts/finetune.py | 23 ++++++++++++------- scripts/generate_commands.py | 21 +++++++++++++++++ 5 files changed, 59 insertions(+), 23 deletions(-) create mode 100644 scripts/generate_commands.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..e620a3c2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -44,11 +44,21 @@ def _post_decimation_step(self): def _resample_commands(self, env_ids): super()._resample_commands(env_ids) - # * with 10% chance, reset to 0 commands - rand_ids = torch_rand_float( - 0, 1, (len(env_ids), 1), device=self.device - ).squeeze(1) - self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) + # * with 10% chance, reset to 0 commands except for forward + self.commands[env_ids, 1:] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) + # * with 10% chance, reset to 0 commands except for rotation + self.commands[env_ids, :2] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) + # * 
with 10% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.9 + ).unsqueeze(1) def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 09970e66..153e54ca 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -129,13 +129,13 @@ class weights: collision = 0.0 action_rate = 0.01 action_rate2 = 0.001 - stand_still = 0.0 + stand_still = 2.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 - swing_grf = 1.5 - stance_grf = 1.5 + reference_traj = 0.0 + swing_grf = 3.0 + stance_grf = 3.0 class termination_weight: termination = 0.15 @@ -151,9 +151,9 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs # Finetuning - learning_rate = 3e-4 - max_gradient_steps = 10 - batch_size = 4096 + # learning_rate = 3e-4 + # max_gradient_steps = 10 + # batch_size = 4096 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c98d088c..c3465433 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -181,9 +181,7 @@ def set_up_logger(self): self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category( - "actor", self.alg.actor, ["action_std", "entropy", "get_std"] - ) + logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) diff --git a/scripts/finetune.py b/scripts/finetune.py index ebeaa6dc..41c24209 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -11,17 +11,20 @@ import os import torch import scipy.io +import numpy as np from tensordict import TensorDict import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul08_13-02-15_PPO32_S16" -LOG_FILE = "PPO32_S16_Jul08_expl08.mat" +LOAD_RUN = "Jul11_17-55-01_PPO32_S16_grf3" +LOG_FILE = "PPO32_S16_grf3_expl1.mat" TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 0.8 # scale std in SmoothActor -ITERATION = 500 +EXPLORATION_SCALE = 1 # scale std in SmoothActor +ITERATION = 700 # load this iteration + PLOT = True +PLOT_N = 300 # number of steps to plot # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -90,10 +93,14 @@ def get_rewards(data_struct, batches): rewards_all = torch.cat((rewards_all, total_rewards), dim=0) rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) - if PLOT: - for name, rewards in rewards_dict.items(): - n_steps = 200 - plt.plot(rewards[:n_steps], label=name) + rewards_stats = {} + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + rewards_stats[name] = [rewards.mean(), rewards.std()] + if PLOT: + plt.plot(rewards[:PLOT_N], label=name) + + print(rewards_stats) return rewards_all.float() diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py new file mode 100644 index 00000000..c1de1425 --- /dev/null +++ b/scripts/generate_commands.py @@ -0,0 +1,21 @@ +import numpy as np +import random + +# Command ranges +x_range = [-0.5, 1] # [m/s] +y_range = [-0.25, 0.25] # [m/s] +yaw_range = [-1, 1] # [rad/s] + +# Generate random command sequence +N = 10 +steps = 300 +commands = 
np.zeros((N * steps, 3)) + +for i in range(1, N): # first iteration 0 commands + x = random.uniform(x_range[0], x_range[1]) + y = random.uniform(y_range[0], y_range[1]) + yaw = random.uniform(yaw_range[0], yaw_range[1]) + commands[i * steps : (i + 1) * steps] = np.array([x, y, yaw]) + +# Export to txt +np.savetxt("commands.txt", commands, fmt="%.3f") From 3006c4c8ef54e32cbe83d1334905f8b9b5f853d4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 15 Jul 2024 13:47:57 -0400 Subject: [PATCH 29/42] evaluate rewards and store in dataframe, added stand still/action rate --- gym/envs/mini_cheetah/minimalist_cheetah.py | 69 ++++++++++- scripts/eval_rewards.py | 126 ++++++++++++++++++++ scripts/finetune.py | 100 +++++++++------- 3 files changed, 249 insertions(+), 46 deletions(-) create mode 100644 scripts/eval_rewards.py diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index c87db2e5..bb67fb69 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -6,10 +6,20 @@ class MinimalistCheetah: Helper class for computing mini cheetah rewards """ - def __init__(self, device="cpu", tracking_sigma=0.25): + def __init__(self, device="cpu", dt=0.01, tracking_sigma=0.25): self.device = device + self.dt = dt self.tracking_sigma = tracking_sigma + # Default joint angles from mini_cheetah_config.py + self.default_dof_pos = torch.tensor( + [0.0, -0.785398, 1.596976], device=self.device + ).repeat(4) + + # Previous 2 dof pos targets + self.dof_target_prev = None + self.dof_target_prev2 = None + def set_states( self, base_height, @@ -17,8 +27,11 @@ def set_states( base_ang_vel, proj_gravity, commands, + dof_pos_obs, + dof_vel, phase_obs, grf, + dof_pos_target, ): # Unsqueeze so first dim is batch_size self.base_height = torch.tensor(base_height, device=self.device).unsqueeze(0) @@ -26,20 +39,36 @@ def set_states( self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) + self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) + # Compute phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) + # Set targets + self.dof_pos_target = torch.tensor( + dof_pos_target, device=self.device + ).unsqueeze(0) + if self.dof_target_prev is None: + self.dof_target_prev = self.dof_pos_target + if self.dof_target_prev2 is None: + self.dof_target_prev2 = self.dof_target_prev + + def post_process(self): + self.dof_target_prev2 = self.dof_target_prev + self.dof_target_prev = self.dof_pos_target + def _sqrdexp(self, x, scale=1.0): """shorthand helper for squared exponential""" return torch.exp(-torch.square(x / scale) / self.tracking_sigma) - def _switch(self, scale=0.5): + def _switch(self, scale=0.1): # TODO: Check scale, RS commands are scaled differently than QGym c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - scale)) / scale + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / scale ) def _reward_min_base_height(self, target_height=0.3, scale=0.3): @@ -73,11 +102,41 @@ def _reward_swing_grf(self, contact_thresh=0.5): in_contact = 
torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) # * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) def _reward_stance_grf(self, contact_thresh=0.5): """Reward non-zero grf during stance (pi to 2pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) # * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos_obs) / torch.pi * 36), + dim=1, + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = 0 # TODO: This seems too noisy + # rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + # rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return (rew_vel + rew_pos - rew_base_vel) * self._switch() + + def _reward_action_rate(self): + """Penalize changes in actions""" + # TODO: check this + error = torch.square(self.dof_pos_target - self.dof_target_prev) / self.dt**2 + return -torch.sum(error, dim=1) + + def _reward_action_rate2(self): + """Penalize changes in actions""" + # TODO: check this + error = ( + torch.square( + self.dof_pos_target - 2 * self.dof_target_prev + self.dof_target_prev2 + ) + / self.dt + ) + return -torch.sum(error, dim=1) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py new file mode 100644 index 00000000..15b287f3 --- /dev/null +++ b/scripts/eval_rewards.py @@ -0,0 +1,126 @@ +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah + +from gym import LEGGED_GYM_ROOT_DIR + +import os +import torch +import scipy.io +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOG_FILE = "PPO_701_a001.mat" +REWARDS_FILE = "rewards_a001.csv" + +ITERATION = 701 # load this iteration + +PLOT = True +PLOT_N = 1000 # number of steps to plot + +# Data struct fields from Robot-Software logs +DATA_LIST = [ + "header", + "base_height", # 1 shape: (1, data_length) + # the following are all shape: (data_length, n) + "base_lin_vel", # 2 + "base_ang_vel", # 3 + "projected_gravity", # 4 + "commands", # 5 + "dof_pos_obs", # 6 + "dof_vel", # 7 + "phase_obs", # 8 + "grf", # 9 + "dof_pos_target", # 10 + "exploration_noise", # 11 + "footer", +] + +# Finetune for these rewards instead of the ones in the cfg +REWARD_WEIGHTS = { + "min_base_height": 1.5, + # "tracking_lin_vel": 4.0, + # "tracking_ang_vel": 2.0, + "orientation": 1.0, + "swing_grf": 1.0, + "stance_grf": 1.0, + "stand_still": 1.0, + "action_rate": 0.001, + "action_rate2": 0.001, +} + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_rewards(data_struct): + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + + mini_cheetah = MinimalistCheetah(device=DEVICE) + rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting + rewards_all = torch.empty(0).to(DEVICE) + + for i in range(data_length): + mini_cheetah.set_states( + base_height=data_struct[1][:, i], # shape (1, data_length) + base_lin_vel=data_struct[2][i], + 
base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], + ) + total_rewards = 0 + for name, weight in REWARD_WEIGHTS.items(): + reward = weight * eval(f"mini_cheetah._reward_{name}()") + rewards_dict[name].append(reward.item()) + total_rewards += reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + # Post process mini cheetah + mini_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() + + # Save rewards in dataframe + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": ITERATION, + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + if PLOT: + plt.plot(rewards[:PLOT_N], label=name) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) + + +def setup(name="SMOOTH_RL_CONTROLLER"): + path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + data = scipy.io.loadmat(path) + data_struct = data[name][0][0] + return data_struct + + +if __name__ == "__main__": + data_struct = setup() + get_rewards(data_struct) + + if PLOT: + plt.title("Rewards") + plt.legend() + plt.show() diff --git a/scripts/finetune.py b/scripts/finetune.py index 41c24209..f4cb44c7 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,35 +12,33 @@ import torch import scipy.io import numpy as np +import pandas as pd from tensordict import TensorDict -import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul11_17-55-01_PPO32_S16_grf3" -LOG_FILE = "PPO32_S16_grf3_expl1.mat" +LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOG_FILE = "PPO_701_a001_expl08.mat" +REWARDS_FILE = "rewards_test.csv" TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 1 # scale std in SmoothActor -ITERATION = 700 # load this iteration - -PLOT = True -PLOT_N = 300 # number of steps to plot +EXPLORATION_SCALE = 0.8 # scale std in SmoothActor +ITERATION = 701 # load this iteration # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # shape (1, data_length) - # the following are all shape (data_length, n) - "base_lin_vel", - "base_ang_vel", - "projected_gravity", - "commands", - "dof_pos_obs", - "dof_vel", - "phase_obs", - "grf", - "dof_pos_target", - "exploration_noise", + "base_height", # 1 shape: (1, data_length) + # the following are all shape: (data_length, n) + "base_lin_vel", # 2 + "base_ang_vel", # 3 + "projected_gravity", # 4 + "commands", # 5 + "dof_pos_obs", # 6 + "dof_vel", # 7 + "phase_obs", # 8 + "grf", # 9 + "dof_pos_target", # 10 + "exploration_noise", # 11 "footer", ] @@ -49,9 +47,12 @@ "min_base_height": 1.5, # "tracking_lin_vel": 4.0, # "tracking_ang_vel": 2.0, - # "orientation": 1.0, - "swing_grf": 1.5, - "stance_grf": 1.5, + "orientation": 1.0, + "swing_grf": 1.0, + "stance_grf": 1.0, + "stand_still": 1.0, + "action_rate": 0.001, + "action_rate2": 0.001, } DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -71,37 +72,59 @@ def get_obs(obs_list, data_struct, batches): return obs_all.float() -def 
get_rewards(data_struct, batches): +def get_rewards(data_struct, num_trajs): + mini_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(TRAJ_LENGTH * batches): - mini_cheetah = MinimalistCheetah(device=DEVICE) + + for i in range(TRAJ_LENGTH * num_trajs): mini_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], phase_obs=data_struct[8][i], grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], ) total_rewards = 0 for name, weight in REWARD_WEIGHTS.items(): - reward = eval(f"mini_cheetah._reward_{name}()") + reward = weight * eval(f"mini_cheetah._reward_{name}()") rewards_dict[name].append(reward.item()) - total_rewards += weight * reward + total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - rewards_all = rewards_all.reshape((TRAJ_LENGTH, batches)) + # Post process mini cheetah + mini_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() - rewards_stats = {} + # Save rewards in dataframe + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) for name, rewards in rewards_dict.items(): rewards = np.array(rewards) - rewards_stats[name] = [rewards.mean(), rewards.std()] - if PLOT: - plt.plot(rewards[:PLOT_N], label=name) - - print(rewards_stats) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": str(ITERATION) + "_expl", + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) + # Return total rewards + rewards_all = rewards_all.reshape((TRAJ_LENGTH, num_trajs)) return rewards_all.float() @@ -183,14 +206,9 @@ def finetune(runner): save_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") runner.save(save_path) - export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_finetuned") + export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_" + str(ITERATION + 1)) runner.export(export_path) - if PLOT: - plt.title("Rewards") - plt.legend() - plt.show() - if __name__ == "__main__": runner = setup() From 27e756849e11fac77c9e9eb7ba8ff94ffe3f043e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 15 Jul 2024 16:47:32 -0400 Subject: [PATCH 30/42] handle IPG in FintuneRunner, and revert storage to single traj --- learning/runners/finetune_runner.py | 67 +++++++++++++++--- scripts/finetune.py | 104 ++++++++++++++-------------- 2 files changed, 108 insertions(+), 63 deletions(-) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 4cfdf895..fdff23b4 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -24,9 +24,25 @@ def _set_up_alg(self): ) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - critic = Critic(num_critic_obs, **self.critic_cfg) - alg_class = eval(self.cfg["algorithm_class_name"]) - self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + + alg_class_name = self.cfg["algorithm_class_name"] + alg_class = eval(alg_class_name) + + if alg_class_name == "PPO_IPG": + 
critic_v = Critic(num_critic_obs, **self.critic_cfg) + critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) + self.alg = alg_class( + actor, + critic_v, + critic_q, + target_critic_q, + device=self.device, + **self.alg_cfg, + ) + else: + critic = Critic(num_critic_obs, **self.critic_cfg) + self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] @@ -37,9 +53,27 @@ def parse_train_cfg(self, train_cfg): def learn(self): # Single update on data dict - self.alg.update(self.data_dict) + if self.cfg["algorithm_class_name"] == "PPO_IPG": + # TODO: How to handle off-policy data + self.alg.update(self.data_dict, self.data_dict) + else: + self.alg.update(self.data_dict) def save(self, path): + if self.cfg["algorithm_class_name"] == "PPO_IPG": + torch.save( + { + "actor_state_dict": self.alg.actor.state_dict(), + "critic_v_state_dict": self.alg.critic_v.state_dict(), + "critic_q_state_dict": self.alg.critic_q.state_dict(), + "optimizer_state_dict": self.alg.optimizer.state_dict(), + "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), + "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), + "iter": 1, # only one iteration + }, + path, + ) + return torch.save( { "actor_state_dict": self.alg.actor.state_dict(), @@ -54,12 +88,25 @@ def save(self, path): def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) - if load_optimizer: - self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) - self.alg.critic_optimizer.load_state_dict( - loaded_dict["critic_optimizer_state_dict"] - ) + + if self.cfg["algorithm_class_name"] == "PPO_IPG": + self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) + self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_v_optimizer.load_state_dict( + loaded_dict["critic_v_opt_state_dict"] + ) + self.alg.critic_q_optimizer.load_state_dict( + loaded_dict["critic_q_opt_state_dict"] + ) + else: + self.alg.critic.load_state_dict(loaded_dict["critic_state_dict"]) + if load_optimizer: + self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) + self.alg.critic_optimizer.load_state_dict( + loaded_dict["critic_optimizer_state_dict"] + ) def export(self, path): # Need to make a copy of actor diff --git a/scripts/finetune.py b/scripts/finetune.py index f4cb44c7..ab97b5bd 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -17,12 +17,20 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -LOG_FILE = "PPO_701_a001_expl08.mat" +# LOAD_RUN = "Jul12_15-53-57_IPG32_S16" +LOG_FILE = "PPO_700_expl08.mat" +ITERATION = 700 # load this iteration + +# IPG_BUFFER = [ +# "IPG_700_expl08.mat", +# "IPG_701_expl08.mat", +# ] + +LOG_REWARDS = False # whether to log to dataframe REWARDS_FILE = "rewards_test.csv" TRAJ_LENGTH = 100 # split data into chunks of this length EXPLORATION_SCALE = 0.8 # scale std in SmoothActor -ITERATION = 701 # load this iteration # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -58,28 +66,25 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -def get_obs(obs_list, 
data_struct, batches): +def get_obs(obs_list, data_struct, batch_size): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - if obs.shape[0] == 1: - obs = obs.T # base_height has shape (1, data_length) - obs = obs[: TRAJ_LENGTH * batches, :] - obs = obs.reshape((TRAJ_LENGTH, batches, -1)) + obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, num_trajs): +def get_rewards(data_struct, batch_size): mini_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(TRAJ_LENGTH * num_trajs): + for i in range(batch_size): mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) + base_height=data_struct[1][:, i], # shape (1, batch_size) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -101,30 +106,28 @@ def get_rewards(data_struct, num_trajs): rewards_dict["total"] = rewards_all.tolist() - # Save rewards in dataframe - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": str(ITERATION) + "_expl", - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": str(ITERATION) + "_expl", + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) + print(rewards_df) - # Return total rewards - rewards_all = rewards_all.reshape((TRAJ_LENGTH, num_trajs)) return rewards_all.float() @@ -132,16 +135,14 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) data = scipy.io.loadmat(path) data_struct = data[name][0][0] - - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - num_trajs = data_length // TRAJ_LENGTH + batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) data_dict = TensorDict( {}, device=DEVICE, - batch_size=(TRAJ_LENGTH - 1, num_trajs), # -1 for next_obs + batch_size=(batch_size - 1, 1), # -1 for next_obs ) - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, num_trajs) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, num_trajs) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) data_dict["actor_obs"] = actor_obs[:-1] data_dict["next_actor_obs"] = actor_obs[1:] @@ -149,24 +150,18 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): data_dict["next_critic_obs"] = critic_obs[1:] 
actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE) - actions = actions[: TRAJ_LENGTH * num_trajs, :] - actions = actions.reshape((TRAJ_LENGTH, num_trajs, -1)) + actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() + actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, num_trajs) + rewards = get_rewards(data_struct, batch_size) + rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) data_dict["rewards"] = rewards[:-1] # Assume no termination - data_dict["timed_out"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) - data_dict["terminal"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) - data_dict["dones"] = torch.zeros( - TRAJ_LENGTH - 1, num_trajs, device=DEVICE, dtype=bool - ) + data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) return data_dict @@ -179,7 +174,10 @@ def setup(): data_dict = get_data_dict(train_cfg) runner = FineTuneRunner( - train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, device=DEVICE + train_cfg, + data_dict, + exploration_scale=EXPLORATION_SCALE, + device=DEVICE, ) runner._set_up_alg() From 0a40204aa84698f43ab4076f3965afcfab86ad2c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 18:41:39 -0400 Subject: [PATCH 31/42] collect sim data for finetuning, added finetune config --- gym/envs/__init__.py | 7 + .../mini_cheetah_finetune_config.py | 149 ++++++++++++++++++ learning/runners/finetune_runner.py | 127 ++++++++++++++- scripts/finetune.py | 129 +++++++-------- 4 files changed, 348 insertions(+), 64 deletions(-) create mode 100644 gym/envs/mini_cheetah/mini_cheetah_finetune_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 57cbd8c0..0736c973 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -27,6 +27,7 @@ "MiniCheetahCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahFineTuneCfg": ".mini_cheetah.mini_cheetah_finetune_config", "MITHumanoidCfg": ".mit_humanoid.mit_humanoid_config", "A1Cfg": ".a1.a1_config", "AnymalCFlatCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -39,6 +40,7 @@ "MiniCheetahRunnerCfg": ".mini_cheetah.mini_cheetah_config", "MiniCheetahRefRunnerCfg": ".mini_cheetah.mini_cheetah_ref_config", "MiniCheetahOscRunnerCfg": ".mini_cheetah.mini_cheetah_osc_config", + "MiniCheetahFineTuneRunnerCfg": ".mini_cheetah.mini_cheetah_finetune_config", "MITHumanoidRunnerCfg": ".mit_humanoid.mit_humanoid_config", "A1RunnerCfg": ".a1.a1_config", "AnymalCFlatRunnerCfg": ".anymal_c.flat.anymal_c_flat_config", @@ -59,6 +61,11 @@ "MiniCheetahOscCfg", "MiniCheetahOscRunnerCfg", ], + "mini_cheetah_finetune": [ + "MiniCheetahRef", + "MiniCheetahFineTuneCfg", + "MiniCheetahFineTuneRunnerCfg", + ], "humanoid": ["MIT_Humanoid", "MITHumanoidCfg", "MITHumanoidRunnerCfg"], "humanoid_running": [ "HumanoidRunning", diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py new file mode 100644 index 00000000..e07ee45a --- /dev/null +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -0,0 +1,149 @@ 
+from gym.envs.mini_cheetah.mini_cheetah_ref_config import ( + MiniCheetahRefCfg, + MiniCheetahRefRunnerCfg, +) + +BASE_HEIGHT_REF = 0.33 + + +class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): + class env(MiniCheetahRefCfg.env): + num_envs = 1 + num_actuators = 12 + episode_length_s = 30.0 + + class terrain(MiniCheetahRefCfg.terrain): + pass + + class init_state(MiniCheetahRefCfg.init_state): + pass + + class control(MiniCheetahRefCfg.control): + # * PD Drive parameters: + stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} + damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} + gait_freq = 3.0 + ctrl_frequency = 50 + desired_sim_frequency = 500 + + class commands(MiniCheetahRefCfg.commands): + # * time before command are changed[s] + resampling_time = 3.0 + + # * smaller ranges for finetuning + class ranges: + lin_vel_x = [-1.0, 1.5] # min max [m/s] + lin_vel_y = 0.5 # max [m/s] + yaw_vel = 1.5 # max [rad/s] + + class push_robots(MiniCheetahRefCfg.push_robots): + pass + + class domain_rand(MiniCheetahRefCfg.domain_rand): + randomize_friction = False + friction_range = [0.5, 1.0] + randomize_base_mass = True + added_mass_range = [5.0, 5.0] + + class asset(MiniCheetahRefCfg.asset): + pass + + class reward_settings(MiniCheetahRefCfg.reward_settings): + pass + + class scaling(MiniCheetahRefCfg.scaling): + pass + + +class MiniCheetahFineTuneRunnerCfg(MiniCheetahRefRunnerCfg): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 + + normalize_obs = True + obs = [ + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + ] + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + scale = 1.0 + dof_pos_obs = 0.01 + base_ang_vel = 0.01 + dof_pos = 0.005 + dof_vel = 0.005 + lin_vel = 0.05 + ang_vel = [0.3, 0.15, 0.4] + gravity_vec = 0.1 + + class critic: + hidden_dims = [256, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + normalize_obs = True + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "phase_obs", + "dof_pos_target", + ] + + class reward: + class weights: + tracking_lin_vel = 2.0 + tracking_ang_vel = 1.0 + orientation = 1.0 + min_base_height = 1.5 + action_rate = 0.001 + action_rate2 = 0.001 + stand_still = 1.0 + swing_grf = 1.0 + stance_grf = 1.0 + + class termination_weight: + termination = 0.15 + + class algorithm(MiniCheetahRefRunnerCfg.algorithm): + desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 + + # IPG + polyak = 0.995 + use_cv = False + inter_nu = 0.5 + beta = "off_policy" + storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + + # Finetuning + learning_rate = 3e-4 + max_gradient_steps = 5 + + class runner(MiniCheetahRefRunnerCfg.runner): + run_name = "" + experiment_name = "mini_cheetah_ref" + max_iterations = 20 # number of policy updates + algorithm_class_name = "PPO2" + num_steps_per_env = 3000 + + # Finetuning + resume = True + load_run = "Jul13_01-49-59_PPO32_S16" + checkpoint = 700 + save_interval = 5 diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index fdff23b4..69b9869f 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -1,11 +1,25 @@ import torch +from tensordict import TensorDict from learning.algorithms 
import * # noqa: F403 from learning.modules import Actor, SmoothActor, Critic from learning.utils import remove_zero_weighted_rewards +from learning.storage import DictStorage +from .BaseRunner import BaseRunner -class FineTuneRunner: - def __init__(self, train_cfg, data_dict, exploration_scale=1.0, device="cpu"): +sim_storage = DictStorage() + + +class FineTuneRunner(BaseRunner): + def __init__( + self, + env, + train_cfg, + data_dict, + exploration_scale=1.0, + device="cpu", + ): + self.env = env self.parse_train_cfg(train_cfg) self.data_dict = data_dict self.exploration_scale = exploration_scale @@ -52,6 +66,21 @@ def parse_train_cfg(self, train_cfg): self.critic_cfg = train_cfg["critic"] def learn(self): + self.alg.switch_to_train() + + if self.env is not None: + # Simulate 1 env, same number of steps as in data dict + num_steps = self.data_dict.shape[0] + sim_data = self.get_sim_data(num_steps) + # Concatenate data dict wtih sim data + self.data_dict = TensorDict( + { + name: torch.cat((self.data_dict[name], sim_data[name]), dim=1) + for name in self.data_dict.keys() + }, + batch_size=(num_steps, 2), + ) + # Single update on data dict if self.cfg["algorithm_class_name"] == "PPO_IPG": # TODO: How to handle off-policy data @@ -59,6 +88,100 @@ def learn(self): else: self.alg.update(self.data_dict) + def get_sim_data(self, num_steps): + rewards_dict = {} + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=1) + + # * Start up storage + transition = TensorDict({}, batch_size=1, device=self.device) + transition.update( + { + "actor_obs": actor_obs, + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), + "critic_obs": critic_obs, + "next_critic_obs": critic_obs, + "rewards": self.get_rewards({"termination": 0.0})["termination"], + "dones": self.get_timed_out(), + } + ) + sim_storage.initialize( + transition, + num_envs=1, + max_storage=num_steps, + device=self.device, + ) + + # * Rollout + with torch.inference_mode(): + for i in range(num_steps): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=1) + + actions = self.alg.act(actor_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + + transition.update( + { + "actor_obs": actor_obs, + "actions": actions, + "critic_obs": critic_obs, + } + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + + # * get time_outs + timed_out = self.get_timed_out() + terminated = self.get_terminated() + dones = timed_out | terminated + + self.update_rewards(rewards_dict, terminated) + total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + + transition.update( + { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, + "rewards": total_rewards, + "timed_out": timed_out, + "dones": dones, + } + ) + sim_storage.add_transitions(transition) + + return sim_storage.data + + def update_rewards(self, rewards_dict, terminated): + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + 
self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + def save(self, path): if self.cfg["algorithm_class_name"] == "PPO_IPG": torch.save( diff --git a/scripts/finetune.py b/scripts/finetune.py index ab97b5bd..42a33739 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -16,21 +16,20 @@ from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -# LOAD_RUN = "Jul12_15-53-57_IPG32_S16" -LOG_FILE = "PPO_700_expl08.mat" -ITERATION = 700 # load this iteration +USE_SIMULATOR = False # IPG_BUFFER = [ # "IPG_700_expl08.mat", # "IPG_701_expl08.mat", # ] -LOG_REWARDS = False # whether to log to dataframe +LOG_REWARDS = False REWARDS_FILE = "rewards_test.csv" -TRAJ_LENGTH = 100 # split data into chunks of this length -EXPLORATION_SCALE = 0.8 # scale std in SmoothActor +# Scales +EXPLORATION_SCALE = 0.8 # used during data collection +ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) +COMMAND_SCALES = np.array([3.0, 1.0, 3.0]) # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -50,20 +49,7 @@ "footer", ] -# Finetune for these rewards instead of the ones in the cfg -REWARD_WEIGHTS = { - "min_base_height": 1.5, - # "tracking_lin_vel": 4.0, - # "tracking_ang_vel": 2.0, - "orientation": 1.0, - "swing_grf": 1.0, - "stance_grf": 1.0, - "stand_still": 1.0, - "action_rate": 0.001, - "action_rate2": 0.001, -} - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +DEVICE = "cuda" def get_obs(obs_list, data_struct, batch_size): @@ -77,18 +63,18 @@ def get_obs(obs_list, data_struct, batch_size): return obs_all.float() -def get_rewards(data_struct, batch_size): - mini_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting +def get_rewards(data_struct, reward_weights, batch_size): + minimalist_cheetah = MinimalistCheetah(device=DEVICE) + rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) for i in range(batch_size): - mini_cheetah.set_states( + minimalist_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, batch_size) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], - commands=data_struct[5][i], + commands=data_struct[5][i] * COMMAND_SCALES, dof_pos_obs=data_struct[6][i], dof_vel=data_struct[7][i], phase_obs=data_struct[8][i], @@ -96,43 +82,47 @@ def get_rewards(data_struct, batch_size): dof_pos_target=data_struct[10][i], ) total_rewards = 0 - for name, weight in REWARD_WEIGHTS.items(): - reward = weight * eval(f"mini_cheetah._reward_{name}()") + for name, weight in reward_weights.items(): + reward = weight * eval(f"minimalist_cheetah._reward_{name}()") rewards_dict[name].append(reward.item()) total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah - mini_cheetah.post_process() + minimalist_cheetah.post_process() rewards_dict["total"] = rewards_all.tolist() - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": str(ITERATION) + "_expl", - "type": name, - "mean": 
mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) - - return rewards_all.float() + return rewards_all.float(), rewards_dict + + +def log_rewards(rewards_dict, path, checkpoint): + if not os.path.exists(path): + rewards_df = pd.DataFrame(columns=["checkpoint", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "checkpoint": str(checkpoint), + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(path, index=False) + print(rewards_df) def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): - path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) + load_run = train_cfg["runner"]["load_run"] + checkpoint = train_cfg["runner"]["checkpoint"] + log_file = str(checkpoint) + ".mat" + + path = os.path.join(ROOT_DIR, load_run, log_file) data = scipy.io.loadmat(path) data_struct = data[name][0][0] batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) @@ -154,13 +144,17 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) data_dict["actions"] = actions[:-1] - rewards = get_rewards(data_struct, batch_size) + reward_weights = train_cfg["critic"]["reward"]["weights"] + rewards, rewards_dict = get_rewards(data_struct, reward_weights, batch_size) rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) data_dict["rewards"] = rewards[:-1] - # Assume no termination + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) + log_rewards(rewards_dict, rewards_path, checkpoint) + + # No time outs and terminations data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["terminal"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) return data_dict @@ -169,11 +163,19 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): def setup(): args = get_args() - _, train_cfg = task_registry.create_cfgs(args) + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + + if USE_SIMULATOR: + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + else: + env = None + train_cfg = class_to_dict(train_cfg) data_dict = get_data_dict(train_cfg) runner = FineTuneRunner( + env, train_cfg, data_dict, exploration_scale=EXPLORATION_SCALE, @@ -185,7 +187,9 @@ def setup(): def finetune(runner): - load_path = os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION) + ".pt") + load_run = runner.cfg["load_run"] + checkpoint = runner.cfg["checkpoint"] + load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") print("Loading model from: ", load_path) runner.load(load_path) @@ -193,18 +197,19 @@ def finetune(runner): runner.learn() # Compare old and new actions - scaling = torch.tensor([0.2, 0.3, 0.3], device=DEVICE).repeat(4) actions_old = runner.data_dict["actions"] - actions_new = scaling * runner.alg.actor.act_inference( + action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + actions_new = action_scales * runner.alg.actor.act_inference( runner.data_dict["actor_obs"] ) diff = actions_new - actions_old print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) + print("Overall mean action diff: ", diff.mean()) - save_path = 
os.path.join(ROOT_DIR, LOAD_RUN, "model_" + str(ITERATION + 1) + ".pt") + save_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint + 1) + ".pt") runner.save(save_path) - export_path = os.path.join(ROOT_DIR, LOAD_RUN, "exported_" + str(ITERATION + 1)) + export_path = os.path.join(ROOT_DIR, load_run, "exported_" + str(checkpoint + 1)) runner.export(export_path) From b01b0a3b56e8d99dd8a8369f8b0096364331069a Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 18:48:14 -0400 Subject: [PATCH 32/42] script to finetune multiple runs with robot-software --- scripts/finetune_multiple.sh | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/finetune_multiple.sh diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh new file mode 100644 index 00000000..81b26056 --- /dev/null +++ b/scripts/finetune_multiple.sh @@ -0,0 +1,49 @@ +#!/bin/bash +source /home/lmolnar/miniconda3/etc/profile.d/conda.sh + +# Args +LOAD_RUN="Jul13_01-49-59_PPO32_S16" +CHECKPOINT=700 +N_RUNS=5 + +# Set directories +QGYM_DIR="/home/lmolnar/workspace/QGym" +QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" + +RS_DIR="/home/lmolnar/workspace/Robot-Software" +RS_POLICY_DIR="${RS_DIR}/config/systems/quadruped/controllers/policies" + +# Change name of export folder +mv ${QGYM_LOG_DIR}/exported ${QGYM_LOG_DIR}/exported_${CHECKPOINT} + +for i in {1..N_RUNS} +do + # Copy exported files to RS + cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_POLICY_DIR} + + # Store LCM logs in file labeled with checkpoint + FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} + if [ -f "$FILE" ]; then + rm "$FILE" + fi + + # Run logging script in background + ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & + PID1=$! 
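+    # $! holds the PID of the logger we just sent to the background; it is
+    # killed below once the quadruped run finishes, so logging stops with the run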
+ + # Run quadruped script and cancel logging script when done + ${RS_DIR}/build/bin/run_quadruped m s + kill $PID1 + + # Convert logs to .mat and copy to QGym + conda deactivate + conda activate robot-sw + ${RS_DIR}/logging/scripts/sim_data_convert.sh -l + cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} + + # Finetune in QGym + conda deactivate + conda activate qgym + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + CHECKPOINT=$((CHECKPOINT + 1)) +done \ No newline at end of file From 77f5bd48b728b16ca368466819f3829f4bd0a5b0 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 16 Jul 2024 19:43:17 -0400 Subject: [PATCH 33/42] bugfix finetune script, update rewards evaluation --- scripts/eval_rewards.py | 44 ++++++++++++++++++++---------------- scripts/finetune_multiple.sh | 18 +++++++-------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 15b287f3..1e4d6c50 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -11,10 +11,8 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul13_01-49-59_PPO32_S16" -LOG_FILE = "PPO_701_a001.mat" -REWARDS_FILE = "rewards_a001.csv" - -ITERATION = 701 # load this iteration +REWARDS_FILE = "rewards.csv" +ITERATIONS = range(700, 705) PLOT = True PLOT_N = 1000 # number of steps to plot @@ -37,7 +35,6 @@ "footer", ] -# Finetune for these rewards instead of the ones in the cfg REWARD_WEIGHTS = { "min_base_height": 1.5, # "tracking_lin_vel": 4.0, @@ -50,10 +47,10 @@ "action_rate2": 0.001, } -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +DEVICE = "cuda" -def get_rewards(data_struct): +def get_rewards(it, data_struct, rewards_path): data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) mini_cheetah = MinimalistCheetah(device=DEVICE) @@ -85,7 +82,6 @@ def get_rewards(data_struct): rewards_dict["total"] = rewards_all.tolist() # Save rewards in dataframe - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) if not os.path.exists(rewards_path): rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) else: @@ -96,7 +92,7 @@ def get_rewards(data_struct): std = rewards.std() rewards_df = rewards_df._append( { - "iteration": ITERATION, + "iteration": it, "type": name, "mean": mean, "std": std, @@ -104,23 +100,31 @@ def get_rewards(data_struct): ignore_index=True, ) if PLOT: + plt.figure(it) plt.plot(rewards[:PLOT_N], label=name) + plt.title(f"Rewards Iteration {it}") + plt.legend() + plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_{it}.png") rewards_df.to_csv(rewards_path, index=False) - print(rewards_df) def setup(name="SMOOTH_RL_CONTROLLER"): - path = os.path.join(ROOT_DIR, LOAD_RUN, LOG_FILE) - data = scipy.io.loadmat(path) - data_struct = data[name][0][0] - return data_struct + data_dict = {} + for it in ITERATIONS: + path = os.path.join(ROOT_DIR, LOAD_RUN, f"{it}.mat") + data = scipy.io.loadmat(path) + data_dict[it] = data[name][0][0] + return data_dict if __name__ == "__main__": - data_struct = setup() - get_rewards(data_struct) + data_dict = setup() + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) - if PLOT: - plt.title("Rewards") - plt.legend() - plt.show() + rewards_df = pd.read_csv(rewards_path) + print(rewards_df) diff 
--git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 81b26056..9a214044 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -8,19 +8,16 @@ N_RUNS=5 # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" -QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" - RS_DIR="/home/lmolnar/workspace/Robot-Software" -RS_POLICY_DIR="${RS_DIR}/config/systems/quadruped/controllers/policies" -# Change name of export folder +QGYM_LOG_DIR="${QGYM_DIR}/logs/mini_cheetah_ref/${LOAD_RUN}" + +# Change default folder name and copy to RS mv ${QGYM_LOG_DIR}/exported ${QGYM_LOG_DIR}/exported_${CHECKPOINT} +cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/controllers/policies -for i in {1..N_RUNS} +for i in $(seq 1 $N_RUNS) do - # Copy exported files to RS - cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_POLICY_DIR} - # Store LCM logs in file labeled with checkpoint FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} if [ -f "$FILE" ]; then @@ -38,12 +35,15 @@ do # Convert logs to .mat and copy to QGym conda deactivate conda activate robot-sw - ${RS_DIR}/logging/scripts/sim_data_convert.sh -l + ${RS_DIR}/logging/scripts/sim_data_convert.sh ${CHECKPOINT} cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym conda deactivate conda activate qgym python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + + # Copy policy to RS CHECKPOINT=$((CHECKPOINT + 1)) + cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/controllers/policies done \ No newline at end of file From 68fde3ea0751ab0d34b5c1e57d6edae753c01e2c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 10:38:13 -0400 Subject: [PATCH 34/42] build off-policy buffer for IPG from RS data --- .../mini_cheetah_finetune_config.py | 8 +- learning/runners/finetune_runner.py | 31 ++--- scripts/finetune.py | 120 +++++++++++------- scripts/finetune_multiple.sh | 2 +- 4 files changed, 92 insertions(+), 69 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index e07ee45a..284fcd0a 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -57,7 +57,7 @@ class scaling(MiniCheetahRefCfg.scaling): class MiniCheetahFineTuneRunnerCfg(MiniCheetahRefRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] @@ -132,18 +132,18 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs # Finetuning - learning_rate = 3e-4 + learning_rate = 1e-4 max_gradient_steps = 5 class runner(MiniCheetahRefRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO_IPG" num_steps_per_env = 3000 # Finetuning resume = True - load_run = "Jul13_01-49-59_PPO32_S16" + load_run = "Jul12_15-53-57_IPG32_S16" checkpoint = 700 save_interval = 5 diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 69b9869f..064073b3 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -15,20 +15,22 @@ def __init__( self, env, train_cfg, - data_dict, + data_onpol, + data_offpol=None, exploration_scale=1.0, 
device="cpu", ): self.env = env self.parse_train_cfg(train_cfg) - self.data_dict = data_dict + self.data_onpol = data_onpol + self.data_offpol = data_offpol self.exploration_scale = exploration_scale self.device = device def _set_up_alg(self): - num_actor_obs = self.data_dict["actor_obs"].shape[-1] - num_actions = self.data_dict["actions"].shape[-1] - num_critic_obs = self.data_dict["critic_obs"].shape[-1] + num_actor_obs = self.data_onpol["actor_obs"].shape[-1] + num_actions = self.data_onpol["actions"].shape[-1] + num_critic_obs = self.data_onpol["critic_obs"].shape[-1] if self.actor_cfg["smooth_exploration"]: actor = SmoothActor( num_obs=num_actor_obs, @@ -69,24 +71,23 @@ def learn(self): self.alg.switch_to_train() if self.env is not None: - # Simulate 1 env, same number of steps as in data dict - num_steps = self.data_dict.shape[0] + # Simulate 1 env, same number of steps as in data + num_steps = self.data_onpol.shape[0] sim_data = self.get_sim_data(num_steps) # Concatenate data dict wtih sim data - self.data_dict = TensorDict( + self.data_onpol = TensorDict( { - name: torch.cat((self.data_dict[name], sim_data[name]), dim=1) - for name in self.data_dict.keys() + name: torch.cat((self.data_onpol[name], sim_data[name]), dim=1) + for name in self.data_onpol.keys() }, batch_size=(num_steps, 2), ) - # Single update on data dict - if self.cfg["algorithm_class_name"] == "PPO_IPG": - # TODO: How to handle off-policy data - self.alg.update(self.data_dict, self.data_dict) + # Single alg update on data + if self.data_offpol is None: + self.alg.update(self.data_onpol) else: - self.alg.update(self.data_dict) + self.alg.update(self.data_onpol, self.data_offpol) def get_sim_data(self, num_steps): rewards_dict = {} diff --git a/scripts/finetune.py b/scripts/finetune.py index 42a33739..9bdc7add 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -18,11 +18,6 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" USE_SIMULATOR = False -# IPG_BUFFER = [ -# "IPG_700_expl08.mat", -# "IPG_701_expl08.mat", -# ] - LOG_REWARDS = False REWARDS_FILE = "rewards_test.csv" @@ -52,25 +47,25 @@ DEVICE = "cuda" -def get_obs(obs_list, data_struct, batch_size): +def get_obs(obs_list, data_struct, data_length): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) + obs = obs.reshape((data_length, 1, -1)) # shape (data_length, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, reward_weights, batch_size): +def get_rewards(data_struct, reward_weights, data_length): minimalist_cheetah = MinimalistCheetah(device=DEVICE) rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) - for i in range(batch_size): + for i in range(data_length): minimalist_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, batch_size) + base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], base_ang_vel=data_struct[3][i], proj_gravity=data_struct[4][i], @@ -117,45 +112,63 @@ def log_rewards(rewards_dict, path, checkpoint): print(rewards_df) -def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER"): +def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): load_run = train_cfg["runner"]["load_run"] checkpoint = train_cfg["runner"]["checkpoint"] - log_file = str(checkpoint) + ".mat" - - path = 
os.path.join(ROOT_DIR, load_run, log_file) - data = scipy.io.loadmat(path) - data_struct = data[name][0][0] - batch_size = data_struct[1].shape[1] # base_height: shape (1, batch_size) - data_dict = TensorDict( - {}, - device=DEVICE, - batch_size=(batch_size - 1, 1), # -1 for next_obs - ) - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, batch_size) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, batch_size) - - data_dict["actor_obs"] = actor_obs[:-1] - data_dict["next_actor_obs"] = actor_obs[1:] - data_dict["critic_obs"] = critic_obs[:-1] - data_dict["next_critic_obs"] = critic_obs[1:] - - actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions.reshape((batch_size, 1, -1)) # shape (batch_size, 1, n) - data_dict["actions"] = actions[:-1] + run_dir = os.path.join(ROOT_DIR, load_run) - reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards, rewards_dict = get_rewards(data_struct, reward_weights, batch_size) - rewards = rewards.reshape((batch_size, 1)) # shape (batch_size, 1) - data_dict["rewards"] = rewards[:-1] - - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) - log_rewards(rewards_dict, rewards_path, checkpoint) + if offpol: + log_files = [file for file in os.listdir(run_dir) if file.endswith(".mat")] + log_files = sorted(log_files) + else: + # Single log file + log_files = [str(checkpoint) + ".mat"] - # No time outs and terminations - data_dict["timed_out"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size - 1, 1, device=DEVICE, dtype=bool) + # Initialize data dict + data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) + data_struct = data[name][0][0] + data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) + batch_size = (data_length - 1, len(log_files)) # -1 for next_obs + data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) + + # Get all data + actor_obs_all = torch.empty(0).to(DEVICE) + critic_obs_all = torch.empty(0).to(DEVICE) + actions_all = torch.empty(0).to(DEVICE) + rewards_all = torch.empty(0).to(DEVICE) + for log in log_files: + data = scipy.io.loadmat(os.path.join(run_dir, log)) + data_struct = data[name][0][0] + + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, data_length) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, data_length) + actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) + critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) + + actions_idx = DATA_LIST.index("dof_pos_target") + actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() + actions = actions.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + actions_all = torch.cat((actions_all, actions), dim=1) + + reward_weights = train_cfg["critic"]["reward"]["weights"] + rewards, rewards_dict = get_rewards(data_struct, reward_weights, data_length) + rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) + rewards_all = torch.cat((rewards_all, rewards), dim=1) + + if LOG_REWARDS: + rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) + log_rewards(rewards_dict, rewards_path, checkpoint) + + data_dict["actor_obs"] = actor_obs_all[:-1] + data_dict["next_actor_obs"] = actor_obs_all[1:] + data_dict["critic_obs"] = critic_obs_all[:-1] + data_dict["next_critic_obs"] = critic_obs_all[1:] + data_dict["actions"] = actions_all[:-1] + data_dict["rewards"] = rewards_all[:-1] + + # No 
time outs and dones + data_dict["timed_out"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) return data_dict @@ -172,12 +185,21 @@ def setup(): env = None train_cfg = class_to_dict(train_cfg) - data_dict = get_data_dict(train_cfg) + data_onpol = get_data_dict(train_cfg, offpol=False) + + if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": + data_offpol = get_data_dict(train_cfg, offpol=True) + else: + data_offpol = None + + print(data_onpol) + print(data_offpol) runner = FineTuneRunner( env, train_cfg, - data_dict, + data_onpol, + data_offpol, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -197,10 +219,10 @@ def finetune(runner): runner.learn() # Compare old and new actions - actions_old = runner.data_dict["actions"] + actions_old = runner.data_onpol["actions"] action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) actions_new = action_scales * runner.alg.actor.act_inference( - runner.data_dict["actor_obs"] + runner.data_onpol["actor_obs"] ) diff = actions_new - actions_old print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 9a214044..fa81966f 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,7 +2,7 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul13_01-49-59_PPO32_S16" +LOAD_RUN="Jul12_15-53-57_IPG32_S16" CHECKPOINT=700 N_RUNS=5 From 5e56d54fb543b3feb9da3889b25e1ce445b71207 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 11:24:31 -0400 Subject: [PATCH 35/42] remove rewards logging in finetune.py --- scripts/finetune.py | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 9bdc7add..51f165cf 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -12,14 +12,11 @@ import torch import scipy.io import numpy as np -import pandas as pd from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" USE_SIMULATOR = False -LOG_REWARDS = False -REWARDS_FILE = "rewards_test.csv" # Scales EXPLORATION_SCALE = 0.8 # used during data collection @@ -60,7 +57,6 @@ def get_obs(obs_list, data_struct, data_length): def get_rewards(data_struct, reward_weights, data_length): minimalist_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(DEVICE) for i in range(data_length): @@ -79,37 +75,12 @@ def get_rewards(data_struct, reward_weights, data_length): total_rewards = 0 for name, weight in reward_weights.items(): reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - rewards_dict[name].append(reward.item()) total_rewards += reward rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah minimalist_cheetah.post_process() - rewards_dict["total"] = rewards_all.tolist() - - return rewards_all.float(), rewards_dict - - -def log_rewards(rewards_dict, path, checkpoint): - if not os.path.exists(path): - rewards_df = pd.DataFrame(columns=["checkpoint", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "checkpoint": str(checkpoint), - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - 
rewards_df.to_csv(path, index=False) - print(rewards_df) + return rewards_all.float() def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): @@ -151,14 +122,10 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): actions_all = torch.cat((actions_all, actions), dim=1) reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards, rewards_dict = get_rewards(data_struct, reward_weights, data_length) + rewards = get_rewards(data_struct, reward_weights, data_length) rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) rewards_all = torch.cat((rewards_all, rewards), dim=1) - if LOG_REWARDS: - rewards_path = os.path.join(ROOT_DIR, load_run, REWARDS_FILE) - log_rewards(rewards_dict, rewards_path, checkpoint) - data_dict["actor_obs"] = actor_obs_all[:-1] data_dict["next_actor_obs"] = actor_obs_all[1:] data_dict["critic_obs"] = critic_obs_all[:-1] From 1c70ce564c6a4e2d84a34b374ce8b8f610a79664 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 17 Jul 2024 18:31:12 -0400 Subject: [PATCH 36/42] merge lm/SAC_plotting branch, visualize PPO and IPG critics --- gym/envs/base/fixed_robot.py | 97 ++++++++---------- gym/envs/base/fixed_robot_config.py | 52 +++++----- gym/envs/cartpole/cartpole_config.py | 2 +- gym/envs/pendulum/pendulum.py | 78 +++++++++++---- gym/envs/pendulum/pendulum_config.py | 77 +++++++++------ learning/modules/lqrc/plotting.py | 142 +++++++++++++++++++++++++++ learning/runners/ipg_runner.py | 26 +++-- learning/runners/on_policy_runner.py | 23 +++-- scripts/visualize_ipg.py | 124 +++++++++++++++++++++++ scripts/visualize_ppo.py | 102 +++++++++++++++++++ 10 files changed, 575 insertions(+), 148 deletions(-) create mode 100644 learning/modules/lqrc/plotting.py create mode 100644 scripts/visualize_ipg.py create mode 100644 scripts/visualize_ppo.py diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 1fe4c5d5..6d9dc971 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -40,64 +40,60 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): self.reset() def step(self): - """Apply actions, simulate, call self.post_physics_step() - and pre_physics_step() - - Args: - actions (torch.Tensor): Tensor of shape - (num_envs, num_actions_per_env) - """ - self._reset_buffers() - self._pre_physics_step() - # * step physics and render each frame + self._pre_decimation_step() self._render() for _ in range(self.cfg.control.decimation): + self._pre_compute_torques() self.torques = self._compute_torques() - - if self.cfg.asset.disable_motors: - self.torques[:] = 0.0 - torques_to_gym_tensor = torch.zeros( - self.num_envs, self.num_dof, device=self.device - ) - - # todo encapsulate - next_torques_idx = 0 - for dof_idx in range(self.num_dof): - if self.cfg.control.actuated_joints_mask[dof_idx]: - torques_to_gym_tensor[:, dof_idx] = self.torques[ - :, next_torques_idx - ] - next_torques_idx += 1 - else: - torques_to_gym_tensor[:, dof_idx] = torch.zeros( - self.num_envs, device=self.device - ) - - self.gym.set_dof_actuation_force_tensor( - self.sim, gymtorch.unwrap_tensor(torques_to_gym_tensor) - ) - self.gym.simulate(self.sim) - if self.device == "cpu": - self.gym.fetch_results(self.sim, True) - self.gym.refresh_dof_state_tensor(self.sim) - - self._post_physics_step() + self._post_compute_torques() + self._step_physx_sim() + self._post_physx_step() + self._post_decimation_step() self._check_terminations_and_timeouts() env_ids = self.to_be_reset.nonzero(as_tuple=False).flatten() 
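        # * re-initialize any envs flagged by the termination/timeout check above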
self._reset_idx(env_ids) - def _pre_physics_step(self): - pass + def _pre_decimation_step(self): + return None + + def _pre_compute_torques(self): + return None + + def _post_compute_torques(self): + if self.cfg.asset.disable_motors: + self.torques[:] = 0.0 - def _post_physics_step(self): + def _step_physx_sim(self): + next_torques_idx = 0 + torques_to_gym_tensor = torch.zeros( + self.num_envs, self.num_dof, device=self.device + ) + for dof_idx in range(self.num_dof): + if self.cfg.control.actuated_joints_mask[dof_idx]: + torques_to_gym_tensor[:, dof_idx] = self.torques[:, next_torques_idx] + next_torques_idx += 1 + else: + torques_to_gym_tensor[:, dof_idx] = torch.zeros( + self.num_envs, device=self.device + ) + self.gym.set_dof_actuation_force_tensor( + self.sim, gymtorch.unwrap_tensor(self.torques) + ) + self.gym.simulate(self.sim) + if self.device == "cpu": + self.gym.fetch_results(self.sim, True) + self.gym.refresh_dof_state_tensor(self.sim) + + def _post_physx_step(self): """ check terminations, compute observations and rewards """ self.gym.refresh_actor_root_state_tensor(self.sim) self.gym.refresh_net_contact_force_tensor(self.sim) + def _post_decimation_step(self): self.episode_length_buf += 1 self.common_step_counter += 1 @@ -212,18 +208,6 @@ def _process_rigid_body_props(self, props, env_id): return props def _compute_torques(self): - """Compute torques from actions. - Actions can be interpreted as position or velocity targets given - to a PD controller, or directly as scaled torques. - [NOTE]: torques must have the same dimension as the number of DOFs, - even if some DOFs are not actuated. - - Args: - actions (torch.Tensor): Actions - - Returns: - [torch.Tensor]: Torques sent to the simulation - """ actuated_dof_pos = torch.zeros( self.num_envs, self.num_actuators, device=self.device ) @@ -415,10 +399,7 @@ def _init_buffers(self): self.default_act_pos = self.default_act_pos.unsqueeze(0) # * store indices of actuated joints self.act_idx = to_torch(actuated_idx, dtype=torch.long, device=self.device) - # * check that init range highs and lows are consistent - # * and repopulate to match - if self.cfg.init_state.reset_mode == "reset_to_range": - self.initialize_ranges_for_initial_conditions() + self.initialize_ranges_for_initial_conditions() def initialize_ranges_for_initial_conditions(self): self.dof_pos_range = torch.zeros( diff --git a/gym/envs/base/fixed_robot_config.py b/gym/envs/base/fixed_robot_config.py index 8c7427f8..a9bdef47 100644 --- a/gym/envs/base/fixed_robot_config.py +++ b/gym/envs/base/fixed_robot_config.py @@ -123,34 +123,33 @@ class FixedRobotCfgPPO(BaseConfig): class logging: enable_local_saving = True - class policy: + class actor: init_noise_std = 1.0 hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - # only for 'ActorCriticRecurrent': - # rnn_type = 'lstm' - # rnn_hidden_size = 512 - # rnn_num_layers = 1 - obs = [ "observation_a", "observation_b", "these_need_to_be_atributes_(states)_of_the_robot_env", ] - - critic_obs = [ - "observation_x", - "observation_y", - "critic_obs_can_be_the_same_or_different_than_actor_obs", - ] + normalize_obs = True actions = ["tau_ff"] disable_actions = False class noise: - noise = 0.1 # implement as needed, also in your robot class + observation_a = 0.1 # implement as needed, also in your robot class + + class critic: + hidden_dims = [512, 256, 128] + activation = "elu" + normalize_obs = True + obs = [ + "observation_x", + 
"observation_y", + "critic_obs_can_be_the_same_or_different_than_actor_obs", + ] class rewards: class weights: @@ -165,20 +164,25 @@ class termination_weight: termination = 0.0 class algorithm: - # * training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 10 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + clip_param = 0.2 - entropy_coef = 0.01 - num_learning_epochs = 5 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.99 - lam = 0.95 desired_kl = 0.01 - max_grad_norm = 1.0 class runner: policy_class_name = "ActorCritic" diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index f18d5a3b..6e073d5c 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -67,7 +67,7 @@ class CartpoleRunnerCfg(FixedRobotCfgPPO): seed = -1 runner_class_name = "OnPolicyRunner" - class policy(FixedRobotCfgPPO.policy): + class actor(FixedRobotCfgPPO.actor): init_noise_std = 1.0 num_layers = 2 num_units = 32 diff --git a/gym/envs/pendulum/pendulum.py b/gym/envs/pendulum/pendulum.py index 95af1fb2..d0bcfb9c 100644 --- a/gym/envs/pendulum/pendulum.py +++ b/gym/envs/pendulum/pendulum.py @@ -1,15 +1,49 @@ import torch +import numpy as np +from math import sqrt from gym.envs.base.fixed_robot import FixedRobot class Pendulum(FixedRobot): - def _post_physics_step(self): - """Update all states that are not handled in PhysX""" - super()._post_physics_step() + def _init_buffers(self): + super()._init_buffers() + self.dof_pos_obs = torch.zeros(self.num_envs, 2, device=self.device) + + def _post_decimation_step(self): + super()._post_decimation_step() + self.dof_pos_obs = torch.cat([self.dof_pos.sin(), self.dof_pos.cos()], dim=1) + + def _reset_system(self, env_ids): + super()._reset_system(env_ids) + self.dof_pos_obs[env_ids] = torch.cat( + [self.dof_pos[env_ids].sin(), self.dof_pos[env_ids].cos()], dim=1 + ) + + def _check_terminations_and_timeouts(self): + super()._check_terminations_and_timeouts() + self.terminated = self.timed_out + + def reset_to_uniform(self, env_ids): + grid_points = int(sqrt(self.num_envs)) + lin_pos = torch.linspace( + self.dof_pos_range[0, 0], + self.dof_pos_range[0, 1], + grid_points, + device=self.device, + ) + lin_vel = torch.linspace( + self.dof_vel_range[0, 0], + self.dof_vel_range[0, 1], + grid_points, + device=self.device, + ) + grid = torch.cartesian_prod(lin_pos, lin_vel) + self.dof_pos[env_ids] = grid[:, 0].unsqueeze(-1) + self.dof_vel[env_ids] = grid[:, 1].unsqueeze(-1) def _reward_theta(self): - theta_rwd = torch.cos(self.dof_pos[:, 0]) / self.scales["dof_pos"] + theta_rwd = torch.cos(self.dof_pos[:, 0]) # no scaling return self._sqrdexp(theta_rwd.squeeze(dim=-1)) def _reward_omega(self): @@ -17,27 +51,31 @@ def _reward_omega(self): return self._sqrdexp(omega_rwd.squeeze(dim=-1)) def _reward_equilibrium(self): - error = torch.abs(self.dof_state) - error[:, 0] /= self.scales["dof_pos"] - error[:, 1] /= self.scales["dof_vel"] - return self._sqrdexp(torch.mean(error, dim=1), scale=0.01) - # return torch.exp( - # -error.pow(2).sum(dim=1) / self.cfg.reward_settings.tracking_sigma - # ) - - def _reward_torques(self): - """Penalize torques""" - return 
self._sqrdexp(torch.mean(torch.square(self.torques), dim=1), scale=0.2) + theta_norm = self._normalize_theta() + omega = self.dof_vel[:, 0] + error = torch.stack( + [theta_norm / self.scales["dof_pos"], omega / self.scales["dof_vel"]], dim=1 + ) + return self._sqrdexp(torch.mean(error, dim=1), scale=0.2) def _reward_energy(self): - m_pendulum = 1.0 - l_pendulum = 1.0 kinetic_energy = ( - 0.5 * m_pendulum * l_pendulum**2 * torch.square(self.dof_vel[:, 0]) + 0.5 + * self.cfg.asset.mass + * self.cfg.asset.length**2 + * torch.square(self.dof_vel[:, 0]) ) potential_energy = ( - m_pendulum * 9.81 * l_pendulum * torch.cos(self.dof_pos[:, 0]) + self.cfg.asset.mass + * 9.81 + * self.cfg.asset.length + * torch.cos(self.dof_pos[:, 0]) ) - desired_energy = m_pendulum * 9.81 * l_pendulum + desired_energy = self.cfg.asset.mass * 9.81 * self.cfg.asset.length energy_error = kinetic_energy + potential_energy - desired_energy return self._sqrdexp(energy_error / desired_energy) + + def _normalize_theta(self): + # normalize to range [-pi, pi] + theta = self.dof_pos[:, 0] + return ((theta + np.pi) % (2 * np.pi)) - np.pi diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 920356cd..399060a2 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -5,9 +5,9 @@ class PendulumCfg(FixedRobotCfg): class env(FixedRobotCfg.env): - num_envs = 2**13 - num_actuators = 1 # 1 for theta connecting base and pole - episode_length_s = 5.0 + num_envs = 4096 + num_actuators = 1 + episode_length_s = 10 class terrain(FixedRobotCfg.terrain): pass @@ -18,12 +18,12 @@ class viewer: lookat = [0.0, 0.0, 0.0] # [m] class init_state(FixedRobotCfg.init_state): - default_joint_angles = {"theta": 0.0} # -torch.pi / 2.0} + default_joint_angles = {"theta": 0} # -torch.pi / 2.0} # * default setup chooses how the initial conditions are chosen. 
# * "reset_to_basic" = a single position # * "reset_to_range" = uniformly random from a range defined below - reset_mode = "reset_to_range" + reset_mode = "reset_to_uniform" # * initial conditions for reset_to_range dof_pos_range = { @@ -33,8 +33,8 @@ class init_state(FixedRobotCfg.init_state): class control(FixedRobotCfg.control): actuated_joints_mask = [1] # angle - ctrl_frequency = 100 - desired_sim_frequency = 200 + ctrl_frequency = 10 + desired_sim_frequency = 100 stiffness = {"theta": 0.0} # [N*m/rad] damping = {"theta": 0.0} # [N*m*s/rad] @@ -44,6 +44,8 @@ class asset(FixedRobotCfg.asset): disable_gravity = False disable_motors = False # all torques set to 0 joint_damping = 0.1 + mass = 1.0 + length = 1.0 class reward_settings(FixedRobotCfg.reward_settings): tracking_sigma = 0.25 @@ -57,15 +59,18 @@ class scaling(FixedRobotCfg.scaling): class PendulumRunnerCfg(FixedRobotCfgPPO): seed = -1 - runner_class_name = "DataLoggingRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" + smooth_exploration = False + exploration_sample_freq = 16 + normalize_obs = False obs = [ - "dof_pos", + "dof_pos_obs", "dof_vel", ] @@ -77,21 +82,22 @@ class noise: dof_vel = 0.0 class critic: - obs = [ - "dof_pos", - "dof_vel", - ] hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" - standard_critic_nn = True + + normalize_obs = False + obs = [ + "dof_pos_obs", + "dof_vel", + ] class reward: class weights: theta = 0.0 omega = 0.0 equilibrium = 1.0 - energy = 0.0 + energy = 0.5 dof_vel = 0.0 torques = 0.025 @@ -99,27 +105,36 @@ class termination_weight: termination = 0.0 class algorithm(FixedRobotCfgPPO.algorithm): - # training params - value_loss_coef = 1.0 - use_clipped_value_loss = True + # both + gamma = 0.95 + # discount_horizon = 2.0 + lam = 0.98 + # shared + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**16 # new clip_param = 0.2 + learning_rate = 1.0e-4 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor entropy_coef = 0.01 - num_learning_epochs = 6 - # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 4 - learning_rate = 1.0e-3 schedule = "fixed" # could be adaptive, fixed - discount_horizon = 2.0 # [s] - lam = 0.98 - # GAE_bootstrap_horizon = .0 # [s] desired_kl = 0.01 - max_grad_norm = 1.0 - standard_loss = True - plus_c_penalty = 0.1 + + # IPG + polyak = 0.995 + use_cv = False + inter_nu = 0.2 + beta = "off_policy" + storage_size = 4 * 100 * 4096 # num_policies*num_steps*num_envs class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" - max_iterations = 500 # number of policy updates - algorithm_class_name = "PPO2" - num_steps_per_env = 32 + max_iterations = 100 # number of policy updates + algorithm_class_name = "PPO_IPG" + num_steps_per_env = 100 + save_interval = 20 diff --git a/learning/modules/lqrc/plotting.py b/learning/modules/lqrc/plotting.py new file mode 100644 index 00000000..dcc913c9 --- /dev/null +++ b/learning/modules/lqrc/plotting.py @@ -0,0 +1,142 @@ +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.colors import ListedColormap + +import numpy as np + + +def create_custom_bwr_colormap(): + # Define the colors for each segment + dark_blue = [0, 0, 0.5, 1] + light_blue = [0.5, 0.5, 1, 1] + white = [1, 1, 1, 1] + light_red = [1, 0.5, 0.5, 1] + dark_red = [0.5, 0, 0, 
1] + + # Number of bins for each segment + n_bins = 128 + mid_band = 5 + + # Create the colormap segments + blue_segment = np.linspace(dark_blue, light_blue, n_bins // 2) + white_segment = np.tile(white, (mid_band, 1)) + red_segment = np.linspace(light_red, dark_red, n_bins // 2) + + # Stack segments to create the full colormap + colors = np.vstack((blue_segment, white_segment, red_segment)) + custom_bwr = ListedColormap(colors, name="custom_bwr") + + return custom_bwr + + +def plot_pendulum_critics_with_data( + x, + predictions, + targets, + actions, + title, + fn, + data, + colorbar_label="f(x)", + grid_size=64, +): + num_critics = len(x.keys()) + fig, axes = plt.subplots(nrows=2, ncols=num_critics, figsize=(4 * num_critics, 6)) + + # Determine global min and max error for consistent scaling + global_min_error = float("inf") + global_max_error = float("-inf") + global_min_prediction = float("inf") + global_max_prediction = float("-inf") + prediction_cmap = mpl.cm.get_cmap("viridis") + action_cmap = mpl.cm.get_cmap("bwr") + + for critic_name in x: + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + global_min_error = min(global_min_error, np.min(np_error)) + global_max_error = max(global_max_error, np.max(np_error)) + global_min_prediction = min( + global_min_prediction, np.min(np_predictions), np.min(np_targets) + ) + global_max_prediction = max( + global_max_prediction, np.max(np_predictions), np.max(np_targets) + ) + prediction_norm = mcolors.CenteredNorm( + vcenter=(global_max_prediction + global_min_prediction) / 2, + halfrange=(global_max_prediction - global_min_prediction) / 2, + ) + action_norm = mcolors.CenteredNorm() + + xcord = np.linspace(-2 * np.pi, 2 * np.pi, grid_size) + ycord = np.linspace(-5, 5, grid_size) + + for idx, critic_name in enumerate(x): + np_predictions = predictions[critic_name].detach().cpu().numpy().reshape(-1) + np_targets = ( + targets["Ground Truth MC Returns"].detach().cpu().numpy().reshape(-1) + ) + np_error = np_predictions - np_targets + + # Plot Predictions + axes[0, idx].imshow( + np_predictions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=prediction_cmap, + norm=prediction_norm, + ) + axes[0, idx].set_title(f"{critic_name} Prediction") + + # Plot MC Trajectories + data = data.detach().cpu().numpy() + theta = data[:, :, 0] + omega = data[:, :, 1] + axes[1, 0].plot(theta, omega, lw=1) + axes[1, 0].set_xlabel("theta") + axes[1, 0].set_ylabel("theta_dot") + fig.suptitle(title, fontsize=16) + + # Plot Actions + np_actions = actions.detach().cpu().numpy().reshape(-1) + axes[1, 1].imshow( + np_actions.reshape(grid_size, grid_size).T, + origin="lower", + extent=(xcord.min(), xcord.max(), ycord.min(), ycord.max()), + cmap=action_cmap, + norm=action_norm, + ) + axes[1, 1].set_title("Actions") + ax1_mappable = mpl.cm.ScalarMappable(norm=action_norm, cmap=action_cmap) + + # Last axis empty + axes[1, 2].axis("off") + + # Ensure the axes are the same for all plots + for ax in axes.flat: + ax.set_xlim([xcord.min(), xcord.max()]) + ax.set_ylim([ycord.min(), ycord.max()]) + + plt.subplots_adjust( + top=0.9, bottom=0.1, left=0.1, right=0.9, hspace=0.4, wspace=0.3 + ) + + fig.colorbar( + mpl.cm.ScalarMappable(norm=prediction_norm, cmap=prediction_cmap), + ax=axes[0, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + 
fig.colorbar( + ax1_mappable, + ax=axes[1, :].ravel().tolist(), + shrink=0.95, + label=colorbar_label, + ) + + plt.savefig(f"{fn}.png") + print(f"Saved to {fn}.png") diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index bb8bb044..4b7b5dfe 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -59,7 +59,7 @@ def learn(self): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # self.save() # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: @@ -76,6 +76,8 @@ def learn(self): "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage_onpol.initialize( @@ -144,6 +146,8 @@ def learn(self): "rewards": total_rewards, "timed_out": timed_out, "dones": dones, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) # add transition to both storages @@ -157,18 +161,19 @@ def learn(self): logger.tic("learning") self.alg.update(storage_onpol.data, storage_offpol.get_data()) - storage_onpol.clear() # only clear on-policy storage logger.toc("learning") - logger.log_all_categories() + if self.it % self.save_interval == 0: + self.save() + storage_onpol.clear() # only clear on-policy storage + + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + self.save(end=True) @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -227,7 +232,7 @@ def set_up_logger(self): (self.alg.actor, self.alg.critic_v, self.alg.critic_q) ) - def save(self): + def save(self, end=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -242,6 +247,13 @@ def save(self): }, path, ) + # Save data + path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) + torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") + # When training ends the on-policy data was cleared + if not end: + path_onpol = os.path.join(self.log_dir, "data_onpol_{}".format(self.it)) + torch.save(storage_onpol.data.cpu(), path_onpol + ".pt") def load(self, path, load_optimizer=True, load_actor_std=True): loaded_dict = torch.load(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index c3465433..6823a6d4 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -30,7 +30,7 @@ def learn(self, states_to_log_dict=None): actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations - self.save() + # self.save() # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: @@ -47,6 +47,8 @@ def learn(self, states_to_log_dict=None): "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage.initialize( @@ -116,6 +118,8 @@ def learn(self, states_to_log_dict=None): "rewards": total_rewards, "timed_out": timed_out, "dones": dones, + "dof_pos": self.env.dof_pos, + "dof_vel": self.env.dof_vel, } ) storage.add_transitions(transition) @@ -127,18 +131,19 
@@ def learn(self, states_to_log_dict=None): logger.tic("learning") self.alg.update(storage.data) - storage.clear() logger.toc("learning") - logger.log_all_categories() + if self.it % self.save_interval == 0: + self.save() + storage.clear() + + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") logger.toc("runtime") logger.print_to_terminal() - if self.it % self.save_interval == 0: - self.save() - self.save() + self.save(end=True) @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -185,7 +190,7 @@ def set_up_logger(self): logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) - def save(self): + def save(self, end=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -198,6 +203,10 @@ def save(self): }, path, ) + # Save data if training hasn't ended, otherwise storage was cleared + if not end: + path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) + torch.save(storage.data.cpu(), path_data + ".pt") def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) diff --git a/scripts/visualize_ipg.py b/scripts/visualize_ipg.py new file mode 100644 index 00000000..993f4e0d --- /dev/null +++ b/scripts/visualize_ipg.py @@ -0,0 +1,124 @@ +import os +import shutil +import torch +import numpy as np +import matplotlib.pyplot as plt + +from learning.modules.actor import Actor +from learning.modules.critic import Critic + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from learning.modules.lqrc.plotting import plot_pendulum_critics_with_data +from gym import LEGGED_GYM_ROOT_DIR + +DEVICE = "cpu" + +# * Setup +LOAD_RUN = "Jul17_17-22-37_IPG_nu1" +TITLE = "IPG nu=1.0" +IT_RANGE = range(20, 101, 20) + +RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) +PLOT_DIR = os.path.join(RUN_DIR, "visualize_critic") +os.makedirs(PLOT_DIR, exist_ok=True) + +# * V-Critic, Q-Critic and Actor +critic_q = Critic( + num_obs=4, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +critic_v = Critic( + num_obs=3, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +actor = Actor( + num_obs=3, + num_actions=1, + hidden_dims=[128, 64, 32], + activation="tanh", + normalize_obs=False, +).to(DEVICE) + +# * Params +n_envs = 4096 # that were trained with +gamma = 0.95 +lam = 1.0 +episode_steps = 100 +visualize_steps = 10 # just to show rollouts +n_trajs = 64 +rand_perm = torch.randperm(n_envs) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + +for it in IT_RANGE: + # load data + base_data = torch.load(os.path.join(RUN_DIR, "data_onpol{}.pt".format(it))).to( + DEVICE + ) + data = base_data.detach().clone() + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data = { + data_name: {} for data_name in ["obs", "values", "returns", "actions"] + } + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load models + model = torch.load(os.path.join(RUN_DIR, "model_{}.pt".format(it))) + 
critic_v.load_state_dict(model["critic_v_state_dict"]) + critic_q.load_state_dict(model["critic_q_state_dict"]) + actor.load_state_dict(model["actor_state_dict"]) + + # V-critic values and returns + data["values"] = critic_v.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic_v) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["V-Critic"] = graphing_obs[0, :] + graphing_data["values"]["V-Critic"] = critic_v.evaluate( + data[0, :]["critic_obs"] + ) + graphing_data["returns"]["V-Critic"] = data[0, :]["returns"] + graphing_data["actions"] = actor(data[0, :]["actor_obs"]) + + # Q-critic values and returns + actions = actor(data["actor_obs"]) + critic_q_obs = torch.cat((data["critic_obs"], actions), dim=2) + data["values"] = critic_q.evaluate(critic_q_obs) + data["next_critic_obs"] = critic_q_obs # needed for GAE + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic_q) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["Q-Critic"] = graphing_obs[0, :] + graphing_data["values"]["Q-Critic"] = critic_q.evaluate(critic_q_obs[0, :]) + graphing_data["returns"]["Q-Critic"] = data[0, :]["returns"] + + # generate plots + grid_size = int(np.sqrt(n_envs)) + plot_pendulum_critics_with_data( + x=graphing_data["obs"], + predictions=graphing_data["values"], + targets=graphing_data["returns"], + actions=graphing_data["actions"], + title=f"{TITLE} Iteration {it}", + fn=PLOT_DIR + f"/IPG_it{it}", + data=graphing_obs[:visualize_steps, traj_idx], + grid_size=grid_size, + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_ipg.py") +shutil.copy(this_file, os.path.join(PLOT_DIR, os.path.basename(this_file))) diff --git a/scripts/visualize_ppo.py b/scripts/visualize_ppo.py new file mode 100644 index 00000000..61cf123a --- /dev/null +++ b/scripts/visualize_ppo.py @@ -0,0 +1,102 @@ +import os +import shutil +import torch +import numpy as np +import matplotlib.pyplot as plt + +from learning.modules.actor import Actor +from learning.modules.critic import Critic + +from learning.utils import ( + compute_generalized_advantages, + compute_MC_returns, +) +from learning.modules.lqrc.plotting import plot_pendulum_critics_with_data +from gym import LEGGED_GYM_ROOT_DIR + +DEVICE = "cpu" + +# * Setup +LOAD_RUN = "Jul17_15-39-53_PPO" +TITLE = "PPO" +IT_RANGE = range(20, 101, 20) + +RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) +PLOT_DIR = os.path.join(RUN_DIR, "visualize_critic") +os.makedirs(PLOT_DIR, exist_ok=True) + +# * Critic and Actor +critic = Critic( + num_obs=3, hidden_dims=[128, 64, 32], activation="tanh", normalize_obs=False +).to(DEVICE) +actor = Actor( + num_obs=3, + num_actions=1, + hidden_dims=[128, 64, 32], + activation="tanh", + normalize_obs=False, +).to(DEVICE) + +# * Params +n_envs = 4096 # that were trained with +gamma = 0.95 +lam = 1.0 +visualize_steps = 10 # just to show rollouts +n_trajs = 64 +rand_perm = torch.randperm(n_envs) +traj_idx = rand_perm[0:n_trajs] +test_idx = rand_perm[n_trajs : n_trajs + 1000] + +for it in IT_RANGE: + # load data + base_data = torch.load(os.path.join(RUN_DIR, "data_{}.pt".format(it))).to(DEVICE) + data = base_data.detach().clone() + + dof_pos = base_data["dof_pos"].detach().clone() + dof_vel = base_data["dof_vel"].detach().clone() + graphing_obs = torch.cat((dof_pos, dof_vel), dim=2) + + # compute ground-truth + graphing_data 
= { + data_name: {} for data_name in ["obs", "values", "returns", "actions"] + } + + episode_rollouts = compute_MC_returns(base_data, gamma) + print(f"Initializing value offset to: {episode_rollouts.mean().item()}") + graphing_data["obs"]["Ground Truth MC Returns"] = graphing_obs[0, :] + graphing_data["values"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + graphing_data["returns"]["Ground Truth MC Returns"] = episode_rollouts[0, :] + + # load models + model = torch.load(os.path.join(RUN_DIR, "model_{}.pt".format(it))) + critic.load_state_dict(model["critic_state_dict"]) + actor.load_state_dict(model["actor_state_dict"]) + + # compute values and returns + data["values"] = critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages(data, gamma, lam, critic) + data["returns"] = data["advantages"] + data["values"] + + with torch.no_grad(): + graphing_data["obs"]["V-Critic"] = graphing_obs[0, :] + graphing_data["values"]["V-Critic"] = critic.evaluate(data[0, :]["critic_obs"]) + graphing_data["returns"]["V-Critic"] = data[0, :]["returns"] + graphing_data["actions"]["V-Critic"] = actor(data[0, :]["actor_obs"]) + + # generate plots + grid_size = int(np.sqrt(n_envs)) + plot_pendulum_critics_with_data( + x=graphing_data["obs"], + predictions=graphing_data["values"], + targets=graphing_data["returns"], + actions=graphing_data["actions"], + title=f"{TITLE} Iteration {it}", + fn=PLOT_DIR + f"/PPO_it{it}", + data=graphing_obs[:visualize_steps, traj_idx], + grid_size=grid_size, + ) + + plt.close() + +this_file = os.path.join(LEGGED_GYM_ROOT_DIR, "scripts", "visualize_ppo.py") +shutil.copy(this_file, os.path.join(PLOT_DIR, os.path.basename(this_file))) From 43dec750a5ea6d92f702eb9543e842566a8ff2ea Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Jul 2024 17:05:24 -0400 Subject: [PATCH 37/42] finetune IPG with sim on-policy and hardware off-policy, rewards increase! 
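Finetuning now mixes two data sources: the simulator provides fresh on-policy
rollouts, while the hardware .mat logs form the off-policy buffer, and PPO_IPG
blends the two gradient estimates with inter_nu. For orientation only, a minimal
sketch of that blended actor update follows. PPO_IPG itself is not part of this
diff, so the function name ipg_actor_update_sketch, the optimizer handling, the
plain likelihood-ratio surrogate, and the assumption that advantages are already
stored in the on-policy batch are all illustrative; act_inference, evaluate,
update_distribution, get_actions_log_prob and the TensorDict keys mirror the
interfaces already used elsewhere in this series.

    import torch


    def ipg_actor_update_sketch(
        actor, critic_q, optimizer, data_onpol, data_offpol, inter_nu=0.5
    ):
        # On-policy term: likelihood-ratio surrogate on simulator rollouts.
        actor.update_distribution(data_onpol["actor_obs"])
        log_prob = actor.get_actions_log_prob(data_onpol["actions"])
        advantages = data_onpol["advantages"].squeeze(-1)  # assumed precomputed
        onpol_loss = -(log_prob * advantages).mean()

        # Off-policy term: push the deterministic policy output towards
        # higher Q on the hardware replay data.
        mean_actions = actor.act_inference(data_offpol["actor_obs"])
        q_in = torch.cat((data_offpol["critic_obs"], mean_actions), dim=-1)
        offpol_loss = -critic_q.evaluate(q_in).mean()

        # inter_nu interpolates the two estimates: nu=0 is the pure
        # on-policy update, nu=1 relies entirely on the Q-critic.
        loss = (1.0 - inter_nu) * onpol_loss + inter_nu * offpol_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

With inter_nu = 0.5 as in the config below, the two terms are weighted
equally under this sketch's convention.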
--- .../mini_cheetah_finetune_config.py | 27 +++--- gym/envs/mini_cheetah/minimalist_cheetah.py | 2 +- learning/runners/finetune_runner.py | 44 ++++----- scripts/eval_rewards.py | 89 +++++++++++++++---- scripts/finetune.py | 77 ++++++++++------ 5 files changed, 156 insertions(+), 83 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index 284fcd0a..32db3fb2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -8,7 +8,7 @@ class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): - num_envs = 1 + num_envs = 1024 num_actuators = 12 episode_length_s = 30.0 @@ -23,7 +23,7 @@ class control(MiniCheetahRefCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 50 + ctrl_frequency = 100 desired_sim_frequency = 500 class commands(MiniCheetahRefCfg.commands): @@ -42,7 +42,7 @@ class push_robots(MiniCheetahRefCfg.push_robots): class domain_rand(MiniCheetahRefCfg.domain_rand): randomize_friction = False friction_range = [0.5, 1.0] - randomize_base_mass = True + randomize_base_mass = False added_mass_range = [5.0, 5.0] class asset(MiniCheetahRefCfg.asset): @@ -108,15 +108,15 @@ class critic: class reward: class weights: - tracking_lin_vel = 2.0 - tracking_ang_vel = 1.0 + tracking_lin_vel = 4.0 + tracking_ang_vel = 2.0 orientation = 1.0 min_base_height = 1.5 - action_rate = 0.001 + stand_still = 2.0 + swing_grf = 3.0 + stance_grf = 3.0 + action_rate = 0.01 action_rate2 = 0.001 - stand_still = 1.0 - swing_grf = 1.0 - stance_grf = 1.0 class termination_weight: termination = 0.15 @@ -129,18 +129,21 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): use_cv = False inter_nu = 0.5 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs + storage_size = 4 * 32 * 4096 # num_policies*num_steps*num_envs # Finetuning + clip_param = 0.2 + max_gradient_steps = 4 + batch_size = 30000 learning_rate = 1e-4 - max_gradient_steps = 5 + schedule = "fixed" class runner(MiniCheetahRefRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates algorithm_class_name = "PPO_IPG" - num_steps_per_env = 3000 + num_steps_per_env = 200 # Finetuning resume = True diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index bb67fb69..1a8af6bf 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -85,7 +85,7 @@ def _reward_tracking_lin_vel(self): # * scale by (1+|cmd|): if cmd=0, no scaling. 
error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) error = torch.sum(torch.square(error), dim=1) - return torch.exp(-error / self.tracking_sigma) + return torch.exp(-error / self.tracking_sigma) * (1 - self._switch()) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 064073b3..127d95d4 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,17 +20,16 @@ def __init__( exploration_scale=1.0, device="cpu", ): - self.env = env - self.parse_train_cfg(train_cfg) self.data_onpol = data_onpol self.data_offpol = data_offpol self.exploration_scale = exploration_scale - self.device = device + # Init calls _set_up_alg which needs exploration scale + super().__init__(env, train_cfg, device) def _set_up_alg(self): - num_actor_obs = self.data_onpol["actor_obs"].shape[-1] - num_actions = self.data_onpol["actions"].shape[-1] - num_critic_obs = self.data_onpol["critic_obs"].shape[-1] + num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) + num_actions = self.get_action_size(self.actor_cfg["actions"]) + num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) if self.actor_cfg["smooth_exploration"]: actor = SmoothActor( num_obs=num_actor_obs, @@ -41,10 +40,8 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - alg_class_name = self.cfg["algorithm_class_name"] - alg_class = eval(alg_class_name) - - if alg_class_name == "PPO_IPG": + alg_class = eval(self.cfg["algorithm_class_name"]) + if self.data_offpol is not None: critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -71,16 +68,11 @@ def learn(self): self.alg.switch_to_train() if self.env is not None: - # Simulate 1 env, same number of steps as in data - num_steps = self.data_onpol.shape[0] - sim_data = self.get_sim_data(num_steps) - # Concatenate data dict wtih sim data + # Simulate on-policy data self.data_onpol = TensorDict( - { - name: torch.cat((self.data_onpol[name], sim_data[name]), dim=1) - for name in self.data_onpol.keys() - }, - batch_size=(num_steps, 2), + self.get_sim_data(), + batch_size=(self.num_steps_per_env, self.env.num_envs), + device=self.device, ) # Single alg update on data @@ -89,17 +81,17 @@ def learn(self): else: self.alg.update(self.data_onpol, self.data_offpol) - def get_sim_data(self, num_steps): + def get_sim_data(self): rewards_dict = {} actor_obs = self.get_obs(self.actor_cfg["obs"]) critic_obs = self.get_obs(self.critic_cfg["obs"]) # * Initialize smooth exploration matrices if self.actor_cfg["smooth_exploration"]: - self.alg.actor.sample_weights(batch_size=1) + self.alg.actor.sample_weights(batch_size=self.env.num_envs) # * Start up storage - transition = TensorDict({}, batch_size=1, device=self.device) + transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( { "actor_obs": actor_obs, @@ -113,18 +105,18 @@ def get_sim_data(self, num_steps): ) sim_storage.initialize( transition, - num_envs=1, - max_storage=num_steps, + self.env.num_envs, + self.env.num_envs * self.num_steps_per_env, device=self.device, ) # * Rollout with torch.inference_mode(): - for i in range(num_steps): + for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration sample_freq = self.actor_cfg["exploration_sample_freq"] if 
self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor.sample_weights(batch_size=1) + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs) self.set_actions( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 1e4d6c50..0cf3ec68 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -10,11 +10,11 @@ import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul13_01-49-59_PPO32_S16" +LOAD_RUN = "Jul12_15-53-57_IPG32_S16" REWARDS_FILE = "rewards.csv" -ITERATIONS = range(700, 705) -PLOT = True +DATAFRAME_LOADED = True +PLOT = False PLOT_N = 1000 # number of steps to plot # Data struct fields from Robot-Software logs @@ -36,14 +36,14 @@ ] REWARD_WEIGHTS = { + "tracking_lin_vel": 4.0, + "tracking_ang_vel": 2.0, "min_base_height": 1.5, - # "tracking_lin_vel": 4.0, - # "tracking_ang_vel": 2.0, "orientation": 1.0, - "swing_grf": 1.0, - "stance_grf": 1.0, - "stand_still": 1.0, - "action_rate": 0.001, + "stand_still": 2.0, + "swing_grf": 3.0, + "stance_grf": 3.0, + "action_rate": 0.01, "action_rate2": 0.001, } @@ -110,21 +110,78 @@ def get_rewards(it, data_struct, rewards_path): def setup(name="SMOOTH_RL_CONTROLLER"): data_dict = {} - for it in ITERATIONS: - path = os.path.join(ROOT_DIR, LOAD_RUN, f"{it}.mat") + log_files = [ + file + for file in os.listdir(os.path.join(ROOT_DIR, LOAD_RUN)) + if file.endswith(".mat") + ] + log_files = sorted(log_files) + for file in log_files: + iteration = int(file.split(".")[0]) + path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[it] = data[name][0][0] + data_dict[iteration] = data[name][0][0] return data_dict if __name__ == "__main__": data_dict = setup() rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) + if not DATAFRAME_LOADED: + # Compute rewards and store in dataframe + if os.path.exists(rewards_path): + os.remove(rewards_path) + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) rewards_df = pd.read_csv(rewards_path) print(rewards_df) + + # Plot rewards stats + num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards + cols = 5 + rows = np.ceil(num_plots / cols).astype(int) + fig, axs = plt.subplots(rows, cols, figsize=(15, 6)) + fig.suptitle("IPG Finetuning Rewards") + axs = axs.flatten() + + for i, key in enumerate(REWARD_WEIGHTS.keys()): + rewards_mean = rewards_df[rewards_df["type"] == key]["mean"].reset_index( + drop=True + ) + rewards_std = rewards_df[rewards_df["type"] == key]["std"].reset_index( + drop=True + ) + axs[i].plot(rewards_mean, label=key) + axs[i].fill_between( + range(len(rewards_mean)), + rewards_mean - rewards_std, + rewards_mean + rewards_std, + color="b", + alpha=0.2, + ) + axs[i].legend() + axs[i].set_xlabel("Iter") + + i = num_plots - 1 + total_mean = rewards_df[rewards_df["type"] == "total"]["mean"].reset_index( + drop=True + ) + total_std = rewards_df[rewards_df["type"] == "total"]["std"].reset_index(drop=True) + axs[i].plot(total_mean, label="total", color="r") + axs[i].fill_between( + range(len(total_mean)), + total_mean - total_std, + total_mean + total_std, + color="r", + alpha=0.2, + ) + axs[i].legend() + axs[i].set_xlabel("Iter") + + for i in range(num_plots, len(axs)): + axs[i].axis("off") + + plt.tight_layout() + plt.show() diff --git 
a/scripts/finetune.py b/scripts/finetune.py index 51f165cf..ca7d404a 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -15,8 +15,10 @@ from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +OUTPUT_FILE = "output.txt" -USE_SIMULATOR = False +USE_SIMULATOR = True +DATA_LENGTH = 2990 # Robot-Software logs must contain at least this many steps # Scales EXPLORATION_SCALE = 0.8 # used during data collection @@ -44,22 +46,23 @@ DEVICE = "cuda" -def get_obs(obs_list, data_struct, data_length): +def get_obs(obs_list, data_struct): obs_all = torch.empty(0).to(DEVICE) for obs_name in obs_list: data_idx = DATA_LIST.index(obs_name) obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + obs = obs.squeeze()[:DATA_LENGTH] + obs = obs.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) obs_all = torch.cat((obs_all, obs), dim=-1) return obs_all.float() -def get_rewards(data_struct, reward_weights, data_length): +def get_rewards(data_struct, reward_weights, dt): minimalist_cheetah = MinimalistCheetah(device=DEVICE) rewards_all = torch.empty(0).to(DEVICE) - for i in range(data_length): + for i in range(DATA_LENGTH): minimalist_cheetah.set_states( base_height=data_struct[1][:, i], # shape (1, data_length) base_lin_vel=data_struct[2][i], @@ -75,7 +78,7 @@ def get_rewards(data_struct, reward_weights, data_length): total_rewards = 0 for name, weight in reward_weights.items(): reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - total_rewards += reward + total_rewards += dt * reward # scaled rewards_all = torch.cat((rewards_all, total_rewards), dim=0) # Post process mini cheetah minimalist_cheetah.post_process() @@ -83,23 +86,26 @@ def get_rewards(data_struct, reward_weights, data_length): return rewards_all.float() -def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): +def get_data_dict(train_cfg, env_cfg, offpol=False, name="SMOOTH_RL_CONTROLLER"): load_run = train_cfg["runner"]["load_run"] checkpoint = train_cfg["runner"]["checkpoint"] run_dir = os.path.join(ROOT_DIR, load_run) if offpol: - log_files = [file for file in os.listdir(run_dir) if file.endswith(".mat")] + # All files up until checkpoint + log_files = [ + file + for file in os.listdir(run_dir) + if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint + ] log_files = sorted(log_files) else: - # Single log file + # Single log file for checkpoint log_files = [str(checkpoint) + ".mat"] # Initialize data dict data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) - data_struct = data[name][0][0] - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - batch_size = (data_length - 1, len(log_files)) # -1 for next_obs + batch_size = (DATA_LENGTH - 1, len(log_files)) # -1 for next_obs data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) # Get all data @@ -111,19 +117,22 @@ def get_data_dict(train_cfg, name="SMOOTH_RL_CONTROLLER", offpol=False): data = scipy.io.loadmat(os.path.join(run_dir, log)) data_struct = data[name][0][0] - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct, data_length) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct, data_length) + actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct) + critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct) actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) actions_idx = 
DATA_LIST.index("dof_pos_target") actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions.reshape((data_length, 1, -1)) # shape (data_length, 1, n) + actions = actions[:DATA_LENGTH] + actions = actions.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) actions_all = torch.cat((actions_all, actions), dim=1) reward_weights = train_cfg["critic"]["reward"]["weights"] - rewards = get_rewards(data_struct, reward_weights, data_length) - rewards = rewards.reshape((data_length, 1)) # shape (data_length, 1) + dt = 1.0 / env_cfg["control"]["ctrl_frequency"] + rewards = get_rewards(data_struct, reward_weights, dt) + rewards = rewards[:DATA_LENGTH] + rewards = rewards.reshape((DATA_LENGTH, 1)) # shape (data_length, 1) rewards_all = torch.cat((rewards_all, rewards), dim=1) data_dict["actor_obs"] = actor_obs_all[:-1] @@ -152,16 +161,14 @@ def setup(): env = None train_cfg = class_to_dict(train_cfg) - data_onpol = get_data_dict(train_cfg, offpol=False) + env_cfg = class_to_dict(env_cfg) + data_onpol = get_data_dict(train_cfg, env_cfg, offpol=False) if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": - data_offpol = get_data_dict(train_cfg, offpol=True) + data_offpol = get_data_dict(train_cfg, env_cfg, offpol=True) else: data_offpol = None - print(data_onpol) - print(data_offpol) - runner = FineTuneRunner( env, train_cfg, @@ -170,7 +177,6 @@ def setup(): exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) - runner._set_up_alg() return runner @@ -179,7 +185,6 @@ def finetune(runner): load_run = runner.cfg["load_run"] checkpoint = runner.cfg["checkpoint"] load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") - print("Loading model from: ", load_path) runner.load(load_path) # Perform a single update @@ -192,15 +197,31 @@ def finetune(runner): runner.data_onpol["actor_obs"] ) diff = actions_new - actions_old - print("Mean action diff per actuator: ", diff.mean(dim=(0, 1))) - print("Overall mean action diff: ", diff.mean()) + # Save and export save_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint + 1) + ".pt") - runner.save(save_path) - export_path = os.path.join(ROOT_DIR, load_run, "exported_" + str(checkpoint + 1)) + runner.save(save_path) runner.export(export_path) + # Print to output file + with open(os.path.join(ROOT_DIR, load_run, OUTPUT_FILE), "a") as f: + f.write(f"############ Checkpoint: {checkpoint} #######################\n") + f.write("############### DATA ###############\n") + f.write(f"Data on-policy shape: {runner.data_onpol.shape}\n") + if runner.data_offpol is not None: + f.write(f"Data off-policy shape: {runner.data_offpol.shape}\n") + f.write("############## LOSSES ##############\n") + f.write(f"Mean Value Loss: {runner.alg.mean_value_loss}\n") + f.write(f"Mean Surrogate Loss: {runner.alg.mean_surrogate_loss}\n") + if runner.data_offpol is not None: + f.write(f"Mean Q Loss: {runner.alg.mean_q_loss}\n") + f.write(f"Mean Offpol Loss: {runner.alg.mean_offpol_loss}\n") + f.write("############## ACTIONS #############\n") + f.write(f"Mean action diff per actuator: {diff.mean(dim=(0, 1))}\n") + f.write(f"Std action diff per actuator: {diff.std(dim=(0, 1))}\n") + f.write(f"Overall mean action diff: {diff.mean()}\n") + if __name__ == "__main__": runner = setup() From b004dbbbe0c2b71f3276d9668d8558325daff4e0 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Jul 2024 19:12:18 -0400 Subject: [PATCH 38/42] update reward plots --- scripts/eval_rewards.py | 77 ++++++++++++++++++++--------------------- 1 
file changed, 38 insertions(+), 39 deletions(-) diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 0cf3ec68..0e0de257 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -11,11 +11,13 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" LOAD_RUN = "Jul12_15-53-57_IPG32_S16" -REWARDS_FILE = "rewards.csv" -DATAFRAME_LOADED = True -PLOT = False -PLOT_N = 1000 # number of steps to plot +REWARDS_FILE = "rewards_nu1.csv" # generate this file from logs, if None: just plot +PLOT_REWARDS = { + "Nu=0.5": "rewards_nu05.csv", + "Nu=0.8": "rewards_nu08.csv", + "Nu=1.0": "rewards_nu1.csv", +} # Data struct fields from Robot-Software logs DATA_LIST = [ @@ -99,16 +101,10 @@ def get_rewards(it, data_struct, rewards_path): }, ignore_index=True, ) - if PLOT: - plt.figure(it) - plt.plot(rewards[:PLOT_N], label=name) - plt.title(f"Rewards Iteration {it}") - plt.legend() - plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_{it}.png") rewards_df.to_csv(rewards_path, index=False) -def setup(name="SMOOTH_RL_CONTROLLER"): +def get_data_dict(name="SMOOTH_RL_CONTROLLER"): data_dict = {} log_files = [ file @@ -124,28 +120,7 @@ def setup(name="SMOOTH_RL_CONTROLLER"): return data_dict -if __name__ == "__main__": - data_dict = setup() - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - - if not DATAFRAME_LOADED: - # Compute rewards and store in dataframe - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) - - rewards_df = pd.read_csv(rewards_path) - print(rewards_df) - - # Plot rewards stats - num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards - cols = 5 - rows = np.ceil(num_plots / cols).astype(int) - fig, axs = plt.subplots(rows, cols, figsize=(15, 6)) - fig.suptitle("IPG Finetuning Rewards") - axs = axs.flatten() - +def plot_rewards(rewards_df, axs, name): for i, key in enumerate(REWARD_WEIGHTS.keys()): rewards_mean = rewards_df[rewards_df["type"] == key]["mean"].reset_index( drop=True @@ -153,35 +128,59 @@ def setup(name="SMOOTH_RL_CONTROLLER"): rewards_std = rewards_df[rewards_df["type"] == key]["std"].reset_index( drop=True ) - axs[i].plot(rewards_mean, label=key) + axs[i].plot(rewards_mean, label=name) axs[i].fill_between( range(len(rewards_mean)), rewards_mean - rewards_std, rewards_mean + rewards_std, - color="b", alpha=0.2, ) + axs[i].set_title(key) axs[i].legend() - axs[i].set_xlabel("Iter") i = num_plots - 1 total_mean = rewards_df[rewards_df["type"] == "total"]["mean"].reset_index( drop=True ) total_std = rewards_df[rewards_df["type"] == "total"]["std"].reset_index(drop=True) - axs[i].plot(total_mean, label="total", color="r") + axs[i].plot(total_mean, label=name) axs[i].fill_between( range(len(total_mean)), total_mean - total_std, total_mean + total_std, - color="r", alpha=0.2, ) + axs[i].set_title("Total Rewards") axs[i].legend() - axs[i].set_xlabel("Iter") + +if __name__ == "__main__": + if REWARDS_FILE is not None: + data_dict = get_data_dict() + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + for it, data_struct in data_dict.items(): + get_rewards(it, data_struct, rewards_path) + + # Plot rewards stats + num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards + cols = 5 + rows = np.ceil(num_plots / cols).astype(int) + fig, axs = plt.subplots(rows, cols, figsize=(30, 10)) + fig.suptitle("IPG Finetuning Rewards") + axs = axs.flatten() + + for name, file in 
PLOT_REWARDS.items(): + path = os.path.join(ROOT_DIR, LOAD_RUN, file) + rewards_df = pd.read_csv(path) + plot_rewards(rewards_df, axs, name) + + for i in range(num_plots): + axs[i].set_xlabel("Iter") for i in range(num_plots, len(axs)): axs[i].axis("off") plt.tight_layout() + plt.savefig(f"{ROOT_DIR}/{LOAD_RUN}/rewards_stats.png") plt.show() From 36c1c8d0fdadda39f16f0334efb5cc968c88e2ef Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 23 Jul 2024 10:31:51 -0400 Subject: [PATCH 39/42] integrate SE for finetuning, refactor runner --- .../mini_cheetah_finetune_config.py | 49 +++-- gym/envs/mini_cheetah/mini_cheetah_ref.py | 17 +- .../mini_cheetah/mini_cheetah_ref_config.py | 23 +-- gym/envs/mini_cheetah/minimalist_cheetah.py | 27 ++- learning/algorithms/SE.py | 47 +++++ learning/algorithms/ppo_ipg.py | 6 +- learning/modules/state_estimator.py | 14 +- learning/runners/finetune_runner.py | 186 ++++++++++++++++-- learning/runners/ipg_runner.py | 12 +- learning/runners/on_policy_runner.py | 7 +- scripts/eval_rewards.py | 138 +++++++------ scripts/finetune.py | 139 ++----------- scripts/finetune_multiple.sh | 15 +- scripts/generate_commands.py | 13 +- 14 files changed, 405 insertions(+), 288 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index 32db3fb2..f4fd30a8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -8,7 +8,7 @@ class MiniCheetahFineTuneCfg(MiniCheetahRefCfg): class env(MiniCheetahRefCfg.env): - num_envs = 1024 + num_envs = 4096 num_actuators = 12 episode_length_s = 30.0 @@ -27,23 +27,13 @@ class control(MiniCheetahRefCfg.control): desired_sim_frequency = 500 class commands(MiniCheetahRefCfg.commands): - # * time before command are changed[s] - resampling_time = 3.0 - - # * smaller ranges for finetuning - class ranges: - lin_vel_x = [-1.0, 1.5] # min max [m/s] - lin_vel_y = 0.5 # max [m/s] - yaw_vel = 1.5 # max [rad/s] + pass class push_robots(MiniCheetahRefCfg.push_robots): pass class domain_rand(MiniCheetahRefCfg.domain_rand): - randomize_friction = False - friction_range = [0.5, 1.0] - randomize_base_mass = False - added_mass_range = [5.0, 5.0] + pass class asset(MiniCheetahRefCfg.asset): pass @@ -93,7 +83,9 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + + # TODO: Check normalization, SAC/IPG need gradient to pass back through actor + normalize_obs = False obs = [ "base_height", "base_lin_vel", @@ -121,15 +113,32 @@ class weights: class termination_weight: termination = 0.15 + class state_estimator: + class network: + hidden_dims = [128, 128] + activation = "tanh" + dropouts = None + + obs = [ + "base_ang_vel", + "projected_gravity", + "dof_pos_obs", + "dof_vel", + "torques", + "phase_obs", + ] + targets = ["base_height", "base_lin_vel", "grf"] + normalize_obs = True + class algorithm(MiniCheetahRefRunnerCfg.algorithm): desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 # IPG polyak = 0.995 use_cv = False - inter_nu = 0.5 + inter_nu = 0.9 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_steps*num_envs + storage_size = 30000 # Finetuning clip_param = 0.2 @@ -143,10 +152,10 @@ class runner(MiniCheetahRefRunnerCfg.runner): experiment_name = "mini_cheetah_ref" max_iterations = 20 # number of policy updates algorithm_class_name = "PPO_IPG" - num_steps_per_env = 200 + num_steps_per_env = 32 
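        # With num_envs = 4096 (set above in this config), 32 steps per env gives
        # 4096 * 32 = 131,072 simulator transitions per finetuning update,
        # combined with the hardware logs loaded by the finetune runner.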
# Finetuning resume = True - load_run = "Jul12_15-53-57_IPG32_S16" - checkpoint = 700 - save_interval = 5 + load_run = "Jul23_00-14-23_nu02_B8" + checkpoint = 1000 + save_interval = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index e620a3c2..dbee12c0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -23,6 +23,7 @@ def _init_buffers(self): self.phase_obs = torch.zeros( self.num_envs, 2, dtype=torch.float, device=self.device ) + self.grf = self._compute_grf() def _reset_system(self, env_ids): super()._reset_system(env_ids) @@ -41,18 +42,19 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self.grf = self._compute_grf() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) - # * with 10% chance, reset to 0 commands except for forward + # * with 20% chance, reset to 0 commands except for forward self.commands[env_ids, 1:] *= ( torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) - < 0.9 + < 0.8 ).unsqueeze(1) - # * with 10% chance, reset to 0 commands except for rotation + # * with 20% chance, reset to 0 commands except for rotation self.commands[env_ids, :2] *= ( torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) - < 0.9 + < 0.8 ).unsqueeze(1) # * with 10% chance, reset to 0 self.commands[env_ids, :] *= ( @@ -66,6 +68,13 @@ def _switch(self): -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 ) + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / 80.0, 1.0) + else: + return grf + def _reward_swing_grf(self): """Reward non-zero grf during swing (0 to pi)""" in_contact = torch.gt( diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 153e54ca..34c20416 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "IPGRunner" class actor: hidden_dims = [256, 256, 128] @@ -103,7 +103,9 @@ class critic: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - normalize_obs = True + + # TODO: Check normalization, SAC/IPG need gradient to pass back through actor + normalize_obs = False obs = [ "base_height", "base_lin_vel", @@ -121,15 +123,15 @@ class weights: tracking_lin_vel = 4.0 tracking_ang_vel = 2.0 lin_vel_z = 0.0 - ang_vel_xy = 0.01 + ang_vel_xy = 0.0 orientation = 1.0 - torques = 5.0e-7 + torques = 0.0 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 action_rate = 0.01 action_rate2 = 0.001 - stand_still = 2.0 + stand_still = 1.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 @@ -148,16 +150,11 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): use_cv = False inter_nu = 0.2 beta = "off_policy" - storage_size = 4 * 32 * 4096 # num_policies*num_stpes*num_envs - - # Finetuning - # learning_rate = 3e-4 - # max_gradient_steps = 10 - # batch_size = 4096 + storage_size = 8 * 32 * 4096 # num_policies*num_stpes*num_envs class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 700 # number of policy updates - 
algorithm_class_name = "PPO2" + max_iterations = 1000 # number of policy updates + algorithm_class_name = "PPO_IPG" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 1a8af6bf..48b7e47a 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -6,16 +6,23 @@ class MinimalistCheetah: Helper class for computing mini cheetah rewards """ - def __init__(self, device="cpu", dt=0.01, tracking_sigma=0.25): + def __init__( + self, device="cpu", tracking_sigma=0.25, ctrl_dt=0.01, ctrl_decimation=5 + ): self.device = device - self.dt = dt self.tracking_sigma = tracking_sigma + # Implemented as in legged robot action rate reward + self.dt = ctrl_dt * ctrl_decimation + # Default joint angles from mini_cheetah_config.py self.default_dof_pos = torch.tensor( [0.0, -0.785398, 1.596976], device=self.device ).repeat(4) + # Scales + self.command_scales = torch.tensor([3.0, 1.0, 3.0]).to(self.device) + # Previous 2 dof pos targets self.dof_target_prev = None self.dof_target_prev2 = None @@ -38,7 +45,10 @@ def set_states( self.base_lin_vel = torch.tensor(base_lin_vel, device=self.device).unsqueeze(0) self.base_ang_vel = torch.tensor(base_ang_vel, device=self.device).unsqueeze(0) self.proj_gravity = torch.tensor(proj_gravity, device=self.device).unsqueeze(0) - self.commands = torch.tensor(commands, device=self.device).unsqueeze(0) + self.commands = ( + torch.tensor(commands, device=self.device).unsqueeze(0) + * self.command_scales + ) self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) @@ -97,14 +107,14 @@ def _reward_orientation(self): error = torch.square(self.proj_gravity[:, :2]) / self.tracking_sigma return torch.sum(torch.exp(-error), dim=1) - def _reward_swing_grf(self, contact_thresh=0.5): + def _reward_swing_grf(self, contact_thresh=50 / 80): """Reward non-zero grf during swing (0 to pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.gt(self.phase_sin, 0) # phase <= pi rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) - def _reward_stance_grf(self, contact_thresh=0.5): + def _reward_stance_grf(self, contact_thresh=50 / 80): """Reward non-zero grf during stance (pi to 2pi)""" in_contact = torch.gt(self.grf, contact_thresh) ph_off = torch.lt(self.phase_sin, 0) # phase >= pi @@ -119,9 +129,8 @@ def _reward_stand_still(self): dim=1, ) rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) - rew_base_vel = 0 # TODO: This seems too noisy - # rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) - # rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) return (rew_vel + rew_pos - rew_base_vel) * self._switch() def _reward_action_rate(self): @@ -137,6 +146,6 @@ def _reward_action_rate2(self): torch.square( self.dof_pos_target - 2 * self.dof_target_prev + self.dof_target_prev2 ) - / self.dt + / self.dt**2 ) return -torch.sum(error, dim=1) diff --git a/learning/algorithms/SE.py b/learning/algorithms/SE.py index 99dfb75f..9d1f2d2d 100644 --- a/learning/algorithms/SE.py +++ b/learning/algorithms/SE.py @@ -3,9 +3,56 @@ from learning.modules import StateEstimatorNN from 
learning.storage import SERolloutStorage +from learning.utils import create_uniform_generator class StateEstimator: + def __init__( + self, + state_estimator, + normalize_obs=True, + batch_size=2**15, + max_gradient_steps=10, + learning_rate=1e-3, + device="cpu", + **kwargs, + ): + self.device = device + + self.network = state_estimator.to(self.device) + + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + + self.learning_rate = learning_rate + self.mean_loss = 0.0 + self.optimizer = optim.Adam(self.network.parameters(), lr=learning_rate) + + def update(self, data): + self.mean_loss = 0 + counter = 0 + generator = create_uniform_generator( + data, self.batch_size, self.max_gradient_steps + ) + for batch in generator: + loss = nn.functional.mse_loss( + self.network.evaluate(batch["SE_obs"]), batch["SE_targets"] + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.mean_loss += loss.item() + counter += 1 + self.mean_loss /= counter + + def estimate(self, obs): + return self.network.evaluate(obs) + + def export(self, path): + self.network.export(path) + + +class OldStateEstimator: """This class provides a learned state estimator. This is trained with supervised learning, using only on-policy data collected in a rollout storage. diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index e377579f..e15ab46d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -107,9 +107,6 @@ def update(self, data_onpol, data_offpol): for param in self.critic_q.parameters(): param.requires_grad = True - # print("On-policy data shape: ", data_onpol.shape) - # print("Off-policy data shape: ", data_offpol.shape) - def update_critic_q(self, data): self.mean_q_loss = 0 counter = 0 @@ -121,7 +118,6 @@ def update_critic_q(self, data): ) for batch in generator: with torch.no_grad(): - # TODO: check that should be inference action_next = self.actor.act_inference(batch["next_actor_obs"]) q_input_next = torch.cat( (batch["next_critic_obs"], action_next), dim=-1 @@ -252,7 +248,7 @@ def update_actor(self, data_onpol, data_offpol): else: b = self.inter_nu - loss = loss_onpol + b * loss_offpol + loss = loss_onpol + b * loss_offpol.requires_grad_() # * Gradient step self.optimizer.zero_grad() diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py index f8cfe221..26a606f7 100644 --- a/learning/modules/state_estimator.py +++ b/learning/modules/state_estimator.py @@ -1,6 +1,5 @@ import torch.nn as nn -from .utils import create_MLP -from .utils import export_network +from .utils import create_MLP, export_network, RunningMeanStd class StateEstimatorNN(nn.Module): @@ -21,6 +20,7 @@ def __init__( hidden_dims=[256, 128], activation="elu", dropouts=None, + normalize_obs=True, **kwargs, ): if kwargs: @@ -30,13 +30,19 @@ def __init__( ) super().__init__() + self._normalize_obs = normalize_obs + if self._normalize_obs: + self.obs_rms = RunningMeanStd(num_inputs) + self.num_inputs = num_inputs self.num_outputs = num_outputs self.NN = create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts) print(f"State Estimator MLP: {self.NN}") - def evaluate(self, observations): - return self.NN(observations) + def evaluate(self, obs): + if self._normalize_obs: + obs = self.obs_rms(obs) + return self.NN(obs) def export(self, path): export_network(self.NN, "state_estimator", path, self.num_inputs) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index 
127d95d4..bc8a0eeb 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -1,12 +1,15 @@ -import torch -from tensordict import TensorDict from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, SmoothActor, Critic -from learning.utils import remove_zero_weighted_rewards +from learning.algorithms import StateEstimator +from learning.modules import Actor, SmoothActor, Critic, StateEstimatorNN from learning.storage import DictStorage - +from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah from .BaseRunner import BaseRunner +import torch +import os +import scipy.io +from tensordict import TensorDict + sim_storage = DictStorage() @@ -15,17 +18,22 @@ def __init__( self, env, train_cfg, - data_onpol, - data_offpol=None, + log_dir, + data_list, + data_length=2990, + data_name="SMOOTH_RL_CONTROLLER", exploration_scale=1.0, device="cpu", ): - self.data_onpol = data_onpol - self.data_offpol = data_offpol - self.exploration_scale = exploration_scale # Init calls _set_up_alg which needs exploration scale + self.exploration_scale = exploration_scale super().__init__(env, train_cfg, device) + self.log_dir = log_dir + self.data_list = data_list # Describes structure of Robot-Software logs + self.data_length = data_length # Logs must contain at least this many steps + self.data_name = data_name + def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) @@ -40,8 +48,21 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if "state_estimator" in self.train_cfg.keys(): + self.se_cfg = self.train_cfg["state_estimator"] + state_estimator_network = StateEstimatorNN( + self.get_obs_size(self.se_cfg["obs"]), + self.get_obs_size(self.se_cfg["targets"]), + **self.se_cfg["network"], + ) + self.SE = StateEstimator( + state_estimator_network, device=self.device, **self.se_cfg + ) + else: + self.SE = None + alg_class = eval(self.cfg["algorithm_class_name"]) - if self.data_offpol is not None: + if self.cfg["algorithm_class_name"] == "PPO_IPG": critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -60,9 +81,137 @@ def _set_up_alg(self): def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] - remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"]) self.actor_cfg = train_cfg["actor"] self.critic_cfg = train_cfg["critic"] + self.train_cfg = train_cfg + + def get_data_dict(self, offpol=False): + checkpoint = self.cfg["checkpoint"] + if offpol: + # All files up until checkpoint + log_files = [ + file + for file in os.listdir(self.log_dir) + if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint + ] + log_files = sorted(log_files) + else: + # Single log file for checkpoint + log_files = [str(checkpoint) + ".mat"] + + # Initialize data dict + data = scipy.io.loadmat(os.path.join(self.log_dir, log_files[0])) + batch_size = (self.data_length - 1, len(log_files)) # -1 for next_obs + data_dict = TensorDict({}, device=self.device, batch_size=batch_size) + + # Collect all data + actor_obs_all = torch.empty(0).to(self.device) + critic_obs_all = torch.empty(0).to(self.device) + actions_all = torch.empty(0).to(self.device) + rewards_all = torch.empty(0).to(self.device) + for log in log_files: + data = 
scipy.io.loadmat(os.path.join(self.log_dir, log)) + self.data_struct = data[self.data_name][0][0] + if self.SE: + self.update_state_estimates() + + actor_obs = self.get_data_obs(self.actor_cfg["obs"], self.data_struct) + critic_obs = self.get_data_obs(self.critic_cfg["obs"], self.data_struct) + actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) + critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) + + actions_idx = self.data_list.index("dof_pos_target") + actions = ( + torch.tensor(self.data_struct[actions_idx]).to(self.device).float() + ) + actions = actions[: self.data_length] + actions = actions.reshape( + (self.data_length, 1, -1) + ) # shape (data_length, 1, n) + actions_all = torch.cat((actions_all, actions), dim=1) + + reward_weights = self.critic_cfg["reward"]["weights"] + rewards, _ = self.get_data_rewards(self.data_struct, reward_weights) + rewards = rewards[: self.data_length] + rewards = rewards.reshape((self.data_length, 1)) # shape (data_length, 1) + rewards_all = torch.cat((rewards_all, rewards), dim=1) + + data_dict["actor_obs"] = actor_obs_all[:-1] + data_dict["next_actor_obs"] = actor_obs_all[1:] + data_dict["critic_obs"] = critic_obs_all[:-1] + data_dict["next_critic_obs"] = critic_obs_all[1:] + data_dict["actions"] = actions_all[:-1] + data_dict["rewards"] = rewards_all[:-1] + + # No time outs and dones + data_dict["timed_out"] = torch.zeros(batch_size, device=self.device, dtype=bool) + data_dict["dones"] = torch.zeros(batch_size, device=self.device, dtype=bool) + + return data_dict + + def get_data_obs(self, obs_list, data_struct): + obs_all = torch.empty(0).to(self.device) + for obs_name in obs_list: + data_idx = self.data_list.index(obs_name) + obs = torch.tensor(data_struct[data_idx]).to(self.device) + obs = obs.squeeze()[: self.data_length] + obs = obs.reshape((self.data_length, 1, -1)) # shape (data_length, 1, n) + obs_all = torch.cat((obs_all, obs), dim=-1) + + return obs_all.float() + + def get_data_rewards(self, data_struct, reward_weights, dt=0.01): + minimalist_cheetah = MinimalistCheetah(device=self.device) + rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting + rewards_all = torch.empty(0).to(self.device) + + for i in range(self.data_length): + minimalist_cheetah.set_states( + base_height=data_struct[1][i], + base_lin_vel=data_struct[2][i], + base_ang_vel=data_struct[3][i], + proj_gravity=data_struct[4][i], + commands=data_struct[5][i], + dof_pos_obs=data_struct[6][i], + dof_vel=data_struct[7][i], + phase_obs=data_struct[8][i], + grf=data_struct[9][i], + dof_pos_target=data_struct[10][i], + ) + total_rewards = 0 + for name, weight in reward_weights.items(): + reward = weight * eval(f"minimalist_cheetah._reward_{name}()") + rewards_dict[name].append(reward.item()) + total_rewards += reward + rewards_all = torch.cat((rewards_all, total_rewards), dim=0) + # Post process mini cheetah + minimalist_cheetah.post_process() + + rewards_dict["total"] = rewards_all.tolist() + rewards_all *= dt # scaled for alg update + + return rewards_all.float(), rewards_dict + + def update_state_estimates(self): + se_obs = torch.empty(0).to(self.device) + for obs in self.se_cfg["obs"]: + data_idx = self.data_list.index(obs) + data = torch.tensor(self.data_struct[data_idx]).to(self.device) + data = data.squeeze()[: self.data_length] + data = data.reshape((self.data_length, -1)) + se_obs = torch.cat((se_obs, data), dim=-1) + + se_targets = self.SE.estimate(se_obs.float()) + + # Overwrite data struct with state estimates + idx = 0 + for 
target in self.se_cfg["targets"]: + data_idx = self.data_list.index(target) + dim = self.data_struct[data_idx].shape[1] + self.data_struct[data_idx] = ( + se_targets[:, idx : idx + dim].cpu().detach().numpy() + ) + idx += dim def learn(self): self.alg.switch_to_train() @@ -74,12 +223,15 @@ def learn(self): batch_size=(self.num_steps_per_env, self.env.num_envs), device=self.device, ) + else: + self.data_onpol = self.get_data_dict() # Single alg update on data - if self.data_offpol is None: - self.alg.update(self.data_onpol) - else: + if self.cfg["algorithm_class_name"] == "PPO_IPG": + self.data_offpol = self.get_data_dict(offpol=True) self.alg.update(self.data_onpol, self.data_offpol) + else: + self.alg.update(self.data_onpol) def get_sim_data(self): rewards_dict = {} @@ -224,6 +376,10 @@ def load(self, path, load_optimizer=True): loaded_dict["critic_optimizer_state_dict"] ) + def load_se(self, se_path): + se_dict = torch.load(se_path) + self.SE.network.load_state_dict(se_dict["SE_state_dict"]) + def export(self, path): # Need to make a copy of actor if self.actor_cfg["smooth_exploration"]: diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index 4b7b5dfe..e46f5665 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -173,7 +173,7 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - self.save(end=True) + self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -232,7 +232,7 @@ def set_up_logger(self): (self.alg.actor, self.alg.critic_v, self.alg.critic_q) ) - def save(self, end=False): + def save(self, save_storage=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -247,13 +247,11 @@ def save(self, end=False): }, path, ) - # Save data - path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) - torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") - # When training ends the on-policy data was cleared - if not end: + if save_storage: path_onpol = os.path.join(self.log_dir, "data_onpol_{}".format(self.it)) + path_offpol = os.path.join(self.log_dir, "data_offpol_{}".format(self.it)) torch.save(storage_onpol.data.cpu(), path_onpol + ".pt") + torch.save(storage_offpol.get_data().cpu(), path_offpol + ".pt") def load(self, path, load_optimizer=True, load_actor_std=True): loaded_dict = torch.load(path) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 6823a6d4..ffad7a99 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -143,7 +143,7 @@ def learn(self, states_to_log_dict=None): logger.toc("runtime") logger.print_to_terminal() - self.save(end=True) + self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -190,7 +190,7 @@ def set_up_logger(self): logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) - def save(self, end=False): + def save(self, save_storage=False): os.makedirs(self.log_dir, exist_ok=True) path = os.path.join(self.log_dir, "model_{}.pt".format(self.it)) torch.save( @@ -203,8 +203,7 @@ def save(self, end=False): }, path, ) - # Save data if training hasn't ended, otherwise storage was cleared - if not end: + if save_storage: path_data = os.path.join(self.log_dir, "data_{}".format(self.it)) torch.save(storage.data.cpu(), path_data + ".pt") diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 0e0de257..87f60071 100644 --- 
a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -1,29 +1,33 @@ -from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils.helpers import class_to_dict + +from learning.runners.finetune_runner import FineTuneRunner from gym import LEGGED_GYM_ROOT_DIR import os -import torch import scipy.io import numpy as np import pandas as pd import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -LOAD_RUN = "Jul12_15-53-57_IPG32_S16" +SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +LOAD_RUN = "Jul23_00-14-23_nu02_B8" -REWARDS_FILE = "rewards_nu1.csv" # generate this file from logs, if None: just plot +REWARDS_FILE = None # generate this file from logs, if None: just plot PLOT_REWARDS = { + "Nu=0.0": "rewards_nu0.csv", "Nu=0.5": "rewards_nu05.csv", - "Nu=0.8": "rewards_nu08.csv", - "Nu=1.0": "rewards_nu1.csv", + "Nu=0.9": "rewards_nu09.csv", + "Nu=0.9 new": "rewards_nu09_new.csv", } # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # 1 shape: (1, data_length) - # the following are all shape: (data_length, n) + "base_height", # 1 "base_lin_vel", # 2 "base_ang_vel", # 3 "projected_gravity", # 4 @@ -52,59 +56,7 @@ DEVICE = "cuda" -def get_rewards(it, data_struct, rewards_path): - data_length = data_struct[1].shape[1] # base_height: shape (1, data_length) - - mini_cheetah = MinimalistCheetah(device=DEVICE) - rewards_dict = {name: [] for name in REWARD_WEIGHTS.keys()} # for plotting - rewards_all = torch.empty(0).to(DEVICE) - - for i in range(data_length): - mini_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) - base_lin_vel=data_struct[2][i], - base_ang_vel=data_struct[3][i], - proj_gravity=data_struct[4][i], - commands=data_struct[5][i], - dof_pos_obs=data_struct[6][i], - dof_vel=data_struct[7][i], - phase_obs=data_struct[8][i], - grf=data_struct[9][i], - dof_pos_target=data_struct[10][i], - ) - total_rewards = 0 - for name, weight in REWARD_WEIGHTS.items(): - reward = weight * eval(f"mini_cheetah._reward_{name}()") - rewards_dict[name].append(reward.item()) - total_rewards += reward - rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - # Post process mini cheetah - mini_cheetah.post_process() - - rewards_dict["total"] = rewards_all.tolist() - - # Save rewards in dataframe - if not os.path.exists(rewards_path): - rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) - else: - rewards_df = pd.read_csv(rewards_path) - for name, rewards in rewards_dict.items(): - rewards = np.array(rewards) - mean = rewards.mean() - std = rewards.std() - rewards_df = rewards_df._append( - { - "iteration": it, - "type": name, - "mean": mean, - "std": std, - }, - ignore_index=True, - ) - rewards_df.to_csv(rewards_path, index=False) - - -def get_data_dict(name="SMOOTH_RL_CONTROLLER"): +def update_rewards_df(runner): data_dict = {} log_files = [ file @@ -116,8 +68,35 @@ def get_data_dict(name="SMOOTH_RL_CONTROLLER"): iteration = int(file.split(".")[0]) path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[iteration] = data[name][0][0] - return data_dict + data_dict[iteration] = data[runner.data_name][0][0] + + rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) + if os.path.exists(rewards_path): + os.remove(rewards_path) + + # Get rewards from runner + for it, data_struct in data_dict.items(): + _, 
rewards_dict = runner.get_data_rewards(data_struct, REWARD_WEIGHTS) + + # Save rewards in dataframe + if not os.path.exists(rewards_path): + rewards_df = pd.DataFrame(columns=["iteration", "type", "mean", "std"]) + else: + rewards_df = pd.read_csv(rewards_path) + for name, rewards in rewards_dict.items(): + rewards = np.array(rewards) + mean = rewards.mean() + std = rewards.std() + rewards_df = rewards_df._append( + { + "iteration": it, + "type": name, + "mean": mean, + "std": std, + }, + ignore_index=True, + ) + rewards_df.to_csv(rewards_path, index=False) def plot_rewards(rewards_df, axs, name): @@ -154,20 +133,39 @@ def plot_rewards(rewards_df, axs, name): axs[i].legend() +def setup(): + args = get_args() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + + train_cfg = class_to_dict(train_cfg) + log_dir = os.path.join(ROOT_DIR, train_cfg["runner"]["load_run"]) + + runner = FineTuneRunner( + env, + train_cfg, + log_dir, + data_list=DATA_LIST, + device=DEVICE, + ) + + return runner + + if __name__ == "__main__": if REWARDS_FILE is not None: - data_dict = get_data_dict() - rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) - if os.path.exists(rewards_path): - os.remove(rewards_path) - for it, data_struct in data_dict.items(): - get_rewards(it, data_struct, rewards_path) + runner = setup() + if SE_PATH is not None: + runner.load_se(SE_PATH) + update_rewards_df(runner) # Plot rewards stats num_plots = len(REWARD_WEIGHTS) + 1 # +1 for total rewards cols = 5 rows = np.ceil(num_plots / cols).astype(int) - fig, axs = plt.subplots(rows, cols, figsize=(30, 10)) + fig, axs = plt.subplots(rows, cols, figsize=(20, 8)) fig.suptitle("IPG Finetuning Rewards") axs = axs.flatten() diff --git a/scripts/finetune.py b/scripts/finetune.py index ca7d404a..6247a466 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -2,34 +2,28 @@ from gym.utils import get_args, task_registry from gym.utils.helpers import class_to_dict -from learning.algorithms import * # noqa: F403 from learning.runners.finetune_runner import FineTuneRunner -from gym.envs.mini_cheetah.minimalist_cheetah import MinimalistCheetah from gym import LEGGED_GYM_ROOT_DIR import os import torch -import scipy.io import numpy as np -from tensordict import TensorDict ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE OUTPUT_FILE = "output.txt" USE_SIMULATOR = True -DATA_LENGTH = 2990 # Robot-Software logs must contain at least this many steps # Scales -EXPLORATION_SCALE = 0.8 # used during data collection +EXPLORATION_SCALE = 0.5 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) -COMMAND_SCALES = np.array([3.0, 1.0, 3.0]) # Data struct fields from Robot-Software logs DATA_LIST = [ "header", - "base_height", # 1 shape: (1, data_length) - # the following are all shape: (data_length, n) + "base_height", # 1 "base_lin_vel", # 2 "base_ang_vel", # 3 "projected_gravity", # 4 @@ -39,141 +33,29 @@ "phase_obs", # 8 "grf", # 9 "dof_pos_target", # 10 - "exploration_noise", # 11 + "torques", # 11 + "exploration_noise", # 12 "footer", ] DEVICE = "cuda" -def get_obs(obs_list, data_struct): - obs_all = torch.empty(0).to(DEVICE) - for obs_name in obs_list: - data_idx = DATA_LIST.index(obs_name) - obs = torch.tensor(data_struct[data_idx]).to(DEVICE) - obs = obs.squeeze()[:DATA_LENGTH] - obs = obs.reshape((DATA_LENGTH, 1, -1)) # 
shape (data_length, 1, n) - obs_all = torch.cat((obs_all, obs), dim=-1) - - return obs_all.float() - - -def get_rewards(data_struct, reward_weights, dt): - minimalist_cheetah = MinimalistCheetah(device=DEVICE) - rewards_all = torch.empty(0).to(DEVICE) - - for i in range(DATA_LENGTH): - minimalist_cheetah.set_states( - base_height=data_struct[1][:, i], # shape (1, data_length) - base_lin_vel=data_struct[2][i], - base_ang_vel=data_struct[3][i], - proj_gravity=data_struct[4][i], - commands=data_struct[5][i] * COMMAND_SCALES, - dof_pos_obs=data_struct[6][i], - dof_vel=data_struct[7][i], - phase_obs=data_struct[8][i], - grf=data_struct[9][i], - dof_pos_target=data_struct[10][i], - ) - total_rewards = 0 - for name, weight in reward_weights.items(): - reward = weight * eval(f"minimalist_cheetah._reward_{name}()") - total_rewards += dt * reward # scaled - rewards_all = torch.cat((rewards_all, total_rewards), dim=0) - # Post process mini cheetah - minimalist_cheetah.post_process() - - return rewards_all.float() - - -def get_data_dict(train_cfg, env_cfg, offpol=False, name="SMOOTH_RL_CONTROLLER"): - load_run = train_cfg["runner"]["load_run"] - checkpoint = train_cfg["runner"]["checkpoint"] - run_dir = os.path.join(ROOT_DIR, load_run) - - if offpol: - # All files up until checkpoint - log_files = [ - file - for file in os.listdir(run_dir) - if file.endswith(".mat") and int(file.split(".")[0]) <= checkpoint - ] - log_files = sorted(log_files) - else: - # Single log file for checkpoint - log_files = [str(checkpoint) + ".mat"] - - # Initialize data dict - data = scipy.io.loadmat(os.path.join(run_dir, log_files[0])) - batch_size = (DATA_LENGTH - 1, len(log_files)) # -1 for next_obs - data_dict = TensorDict({}, device=DEVICE, batch_size=batch_size) - - # Get all data - actor_obs_all = torch.empty(0).to(DEVICE) - critic_obs_all = torch.empty(0).to(DEVICE) - actions_all = torch.empty(0).to(DEVICE) - rewards_all = torch.empty(0).to(DEVICE) - for log in log_files: - data = scipy.io.loadmat(os.path.join(run_dir, log)) - data_struct = data[name][0][0] - - actor_obs = get_obs(train_cfg["actor"]["obs"], data_struct) - critic_obs = get_obs(train_cfg["critic"]["obs"], data_struct) - actor_obs_all = torch.cat((actor_obs_all, actor_obs), dim=1) - critic_obs_all = torch.cat((critic_obs_all, critic_obs), dim=1) - - actions_idx = DATA_LIST.index("dof_pos_target") - actions = torch.tensor(data_struct[actions_idx]).to(DEVICE).float() - actions = actions[:DATA_LENGTH] - actions = actions.reshape((DATA_LENGTH, 1, -1)) # shape (data_length, 1, n) - actions_all = torch.cat((actions_all, actions), dim=1) - - reward_weights = train_cfg["critic"]["reward"]["weights"] - dt = 1.0 / env_cfg["control"]["ctrl_frequency"] - rewards = get_rewards(data_struct, reward_weights, dt) - rewards = rewards[:DATA_LENGTH] - rewards = rewards.reshape((DATA_LENGTH, 1)) # shape (data_length, 1) - rewards_all = torch.cat((rewards_all, rewards), dim=1) - - data_dict["actor_obs"] = actor_obs_all[:-1] - data_dict["next_actor_obs"] = actor_obs_all[1:] - data_dict["critic_obs"] = critic_obs_all[:-1] - data_dict["next_critic_obs"] = critic_obs_all[1:] - data_dict["actions"] = actions_all[:-1] - data_dict["rewards"] = rewards_all[:-1] - - # No time outs and dones - data_dict["timed_out"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) - data_dict["dones"] = torch.zeros(batch_size, device=DEVICE, dtype=bool) - - return data_dict - - def setup(): args = get_args() env_cfg, train_cfg = task_registry.create_cfgs(args) task_registry.make_gym_and_sim() - - 
if USE_SIMULATOR: - env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - else: - env = None + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) train_cfg = class_to_dict(train_cfg) - env_cfg = class_to_dict(env_cfg) - data_onpol = get_data_dict(train_cfg, env_cfg, offpol=False) - - if train_cfg["runner"]["algorithm_class_name"] == "PPO_IPG": - data_offpol = get_data_dict(train_cfg, env_cfg, offpol=True) - else: - data_offpol = None + log_dir = os.path.join(ROOT_DIR, train_cfg["runner"]["load_run"]) runner = FineTuneRunner( env, train_cfg, - data_onpol, - data_offpol, + log_dir, + data_list=DATA_LIST, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -187,6 +69,9 @@ def finetune(runner): load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") runner.load(load_path) + if SE_PATH is not None: + runner.load_se(SE_PATH) + # Perform a single update runner.learn() diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index fa81966f..95e84728 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,9 +2,10 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul12_15-53-57_IPG32_S16" -CHECKPOINT=700 -N_RUNS=5 +LOAD_RUN="Jul23_00-14-23_nu02_B8" +CHECKPOINT=1000 +N_RUNS=6 +EVAL=false # make sure to turn exploration on/off in RS # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" @@ -39,9 +40,11 @@ do cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym - conda deactivate - conda activate qgym - python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + if [ "$EVAL" = false ] ; then + conda deactivate + conda activate qgym + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + fi # Copy policy to RS CHECKPOINT=$((CHECKPOINT + 1)) diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py index c1de1425..e8ef18a4 100644 --- a/scripts/generate_commands.py +++ b/scripts/generate_commands.py @@ -1,10 +1,15 @@ import numpy as np import random -# Command ranges -x_range = [-0.5, 1] # [m/s] -y_range = [-0.25, 0.25] # [m/s] -yaw_range = [-1, 1] # [rad/s] +# Command ranges during training: +# x_range = [-2.0, 3.0] # [m/s] +# y_range = [-1.0, 1.0] # [m/s] +# yaw_range = [-3.0. 
3.0] # [rad/s] + +# Command ranges for finetuning: +x_range = [-1.0, 1.5] # [m/s] +y_range = [-0.5, 0.5] # [m/s] +yaw_range = [-1.5, 1.5] # [rad/s] # Generate random command sequence N = 10 From edf31679a25e106ca82b0f6690f07e3118d24ed9 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Tue, 23 Jul 2024 18:11:50 -0400 Subject: [PATCH 40/42] bugfix scale action noise, added scaling and refactorings --- gym/envs/base/task_skeleton.py | 3 ++- gym/envs/mini_cheetah/minimalist_cheetah.py | 13 ++++++++++--- learning/algorithms/ppo_ipg.py | 2 +- learning/modules/smooth_actor.py | 2 +- scripts/train.py | 5 ++--- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index ffbac6ce..0974970a 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -45,6 +45,7 @@ def reset(self): """Reset all robots""" self._reset_idx(torch.arange(self.num_envs, device=self.device)) self.step() + self.episode_length_buf[:] = 0 def _reset_buffers(self): self.to_be_reset[:] = False @@ -67,7 +68,7 @@ def _eval_reward(self, name): def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] - self.terminated = torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length self.to_be_reset = self.timed_out | self.terminated diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 48b7e47a..4702f6ac 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -22,6 +22,8 @@ def __init__( # Scales self.command_scales = torch.tensor([3.0, 1.0, 3.0]).to(self.device) + self.dof_pos_scales = torch.tensor([0.2, 0.3, 0.3]).to(self.device).repeat(4) + self.dof_vel_scales = torch.tensor([2.0, 2.0, 4.0]).to(self.device).repeat(4) # Previous 2 dof pos targets self.dof_target_prev = None @@ -49,15 +51,20 @@ def set_states( torch.tensor(commands, device=self.device).unsqueeze(0) * self.command_scales ) - self.dof_pos_obs = torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) - self.dof_vel = torch.tensor(dof_vel, device=self.device).unsqueeze(0) + self.dof_pos_obs = ( + torch.tensor(dof_pos_obs, device=self.device).unsqueeze(0) + * self.dof_pos_scales + ) + self.dof_vel = ( + torch.tensor(dof_vel, device=self.device).unsqueeze(0) * self.dof_vel_scales + ) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) # Compute phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - # Set targets + # Set targets, these are scaled in Robot-Software self.dof_pos_target = torch.tensor( dof_pos_target, device=self.device ).unsqueeze(0) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index e15ab46d..0dd7907f 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -256,7 +256,7 @@ def update_actor(self, data_onpol, data_offpol): nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.optimizer.step() self.mean_surrogate_loss += loss_onpol.item() - self.mean_offpol_loss += loss_offpol.item() + self.mean_offpol_loss += b * loss_offpol.item() counter += 1 self.mean_surrogate_loss /= counter self.mean_offpol_loss /= counter diff --git 
a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 72c4ace1..f824d263 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -107,7 +107,7 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) mean = self.distribution.mean - sample = mean + self.get_noise() + sample = mean + self.get_noise() * self.exploration_scale if self.debug: path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" self.log_actions(mean[0][2], sample[0][2], path) diff --git a/scripts/train.py b/scripts/train.py index 283717e8..40966682 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ from gym.envs import __init__ # noqa: F401 -from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils import get_args, task_registry from gym.utils.logging_and_saving import wandb_singleton from gym.utils.logging_and_saving import local_code_save_helper @@ -12,9 +12,8 @@ def setup(): task_registry.make_gym_and_sim() wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) - randomize_episode_counters(env) + # randomize_episode_counters(env) - randomize_episode_counters(env) policy_runner = task_registry.make_alg_runner(env, train_cfg) local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) From 276f84b0bc59bc2e1c2398f7d44e7c50532b221b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 26 Jul 2024 11:23:11 -0400 Subject: [PATCH 41/42] merged LinkedIPG, bugfix target network save, refactorings --- gym/envs/pendulum/pendulum_config.py | 7 +- gym/utils/helpers.py | 8 + learning/algorithms/__init__.py | 1 + learning/algorithms/linked_ipg.py | 316 +++++++++++++++++++++++++++ learning/algorithms/ppo_ipg.py | 12 +- learning/runners/finetune_runner.py | 114 +++++++--- learning/runners/ipg_runner.py | 12 +- scripts/eval_rewards.py | 24 +- scripts/finetune.py | 69 +++++- scripts/finetune_multiple.sh | 20 +- scripts/visualize_ipg.py | 6 +- 11 files changed, 508 insertions(+), 81 deletions(-) create mode 100644 learning/algorithms/linked_ipg.py diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index 399060a2..51c44557 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -125,16 +125,17 @@ class algorithm(FixedRobotCfgPPO.algorithm): desired_kl = 0.01 # IPG - polyak = 0.995 + polyak = 0.9 use_cv = False - inter_nu = 0.2 + inter_nu = 0.5 beta = "off_policy" storage_size = 4 * 100 * 4096 # num_policies*num_steps*num_envs + val_interpolation = 0.5 # 0: use V(s'), 1: use Q(s', pi(s')) class runner(FixedRobotCfgPPO.runner): run_name = "" experiment_name = "pendulum" max_iterations = 100 # number of policy updates - algorithm_class_name = "PPO_IPG" + algorithm_class_name = "LinkedIPG" num_steps_per_env = 100 save_interval = 20 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..2f3a5dcf 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -181,6 +181,9 @@ def update_cfg_from_args(env_cfg, train_cfg, args): train_cfg.runner.checkpoint = args.checkpoint if args.rl_device is not None: train_cfg.runner.device = args.rl_device + # * IPG parameters + if args.inter_nu is not None: + train_cfg.algorithm.inter_nu = args.inter_nu def get_args(custom_parameters=None): @@ -301,6 +304,11 @@ def get_args(custom_parameters=None): "default": False, "help": "Use original config file for loaded 
policy.", }, + { + "name": "--inter_nu", + "type": float, + "help": "Interpolation parameter for IPG.", + }, ] # * parse arguments args = gymutil.parse_arguments( diff --git a/learning/algorithms/__init__.py b/learning/algorithms/__init__.py index b775a439..8bee38d0 100644 --- a/learning/algorithms/__init__.py +++ b/learning/algorithms/__init__.py @@ -33,4 +33,5 @@ from .ppo import PPO from .ppo2 import PPO2 from .ppo_ipg import PPO_IPG +from .linked_ipg import LinkedIPG from .SE import StateEstimator diff --git a/learning/algorithms/linked_ipg.py b/learning/algorithms/linked_ipg.py new file mode 100644 index 00000000..de024276 --- /dev/null +++ b/learning/algorithms/linked_ipg.py @@ -0,0 +1,316 @@ +import torch +import torch.nn as nn +import torch.optim as optim + +from learning.utils import ( + create_uniform_generator, + compute_generalized_advantages, + normalize, + polyak_update, +) + + +class LinkedIPG: + def __init__( + self, + actor, + critic_v, + critic_q, + target_critic_q, + batch_size=2**15, + max_gradient_steps=10, + clip_param=0.2, + gamma=0.998, + lam=0.95, + entropy_coef=0.0, + learning_rate=1e-3, + max_grad_norm=1.0, + use_clipped_value_loss=True, + schedule="fixed", + desired_kl=0.01, + polyak=0.995, + use_cv=False, + inter_nu=0.2, + beta="off_policy", + device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, + val_interpolation=0.5, + **kwargs, + ): + self.device = device + + self.desired_kl = desired_kl + self.schedule = schedule + self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio + + # * PPO components + self.actor = actor.to(self.device) + self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) + self.critic_v = critic_v.to(self.device) + self.critic_v_optimizer = optim.Adam( + self.critic_v.parameters(), lr=learning_rate + ) + + # * IPG components + self.critic_q = critic_q.to(self.device) + self.critic_q_optimizer = optim.Adam( + self.critic_q.parameters(), lr=learning_rate + ) + self.target_critic_q = target_critic_q.to(self.device) + self.target_critic_q.load_state_dict(self.critic_q.state_dict()) + + # * PPO parameters + self.clip_param = clip_param + self.batch_size = batch_size + self.max_gradient_steps = max_gradient_steps + self.entropy_coef = entropy_coef + self.gamma = gamma + self.lam = lam + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + # * IPG parameters + self.polyak = polyak + self.use_cv = use_cv + self.inter_nu = inter_nu + self.beta = beta + self.val_interpolation = val_interpolation + + def switch_to_train(self): + self.actor.train() + self.critic_v.train() + self.critic_q.train() + + def act(self, obs): + return self.actor.act(obs).detach() + + def update(self, data_onpol, data_offpol): + # On-policy GAE + data_onpol["values"] = self.critic_v.evaluate(data_onpol["critic_obs"]) + data_onpol["advantages"] = compute_generalized_advantages( + data_onpol, self.gamma, self.lam, self.critic_v + ) + data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] + data_onpol["advantages"] = normalize(data_onpol["advantages"]) + + data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + data_offpol["advantages"] = compute_generalized_advantages( + data_offpol, self.gamma, self.lam, self.critic_v + ) + data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + + self.update_critic_v(data_offpol) + self.update_critic_q(data_offpol) + self.update_actor(data_onpol, data_offpol) + + # def update_joint_critics(self, 
data_onpol, data_offpol): + # self.mean_q_loss = 0 + # self.mean_value_loss = 0 + # counter = 0 + # generator_onpol = create_uniform_generator( + # data_onpol, + # self.batch_size, + # max_gradient_steps=self.max_gradient_steps, + # ) + # generator_offpol = create_uniform_generator( + # data_offpol, + # self.batch_size, + # max_gradient_steps=self.max_gradient_steps, + # ) + # for batch_onpol, batch_offpol in zip(generator_onpol, generator_offpol): + # with torch.no_grad(): + # action_next_onpol = self.actor.act_inference( + # batch_onpol["next_actor_obs"] + # ) + # q_input_next_onpol = torch.cat( + # batch_onpol["next_critic_obs"], action_next_onpol + # ) + # action_next_offpol = self.actor.act_inference( + # batch_offpol["next_actor_obs"] + # ) + # q_input_next_offpol = torch.cat( + # batch_offpol["next_critic_obs"], action_next_offpol + # ) + # q_value_offpol = self.critic_q.evaluate(q_input_next_offpol) + + # loss_V_returns = self.critic_v.loss_fn( + # batch_onpol["critic_obs"], batch_onpol["returns"] + # ) + # loss_V_Q = nn.functional.mse_loss( + # self.critic_v.evaluate(batch_onpol["critic_obs"]), + # self.critic_q.evaluate(batch_onpol["critic_obs"]), + # reduction="mean") + # + # with torch.no_grad(): + # action_next = self.actor.act_inference(batch_offpol["next_actor_obs"]) + # q_input_next = torch.cat( + # (batch_offpol["next_critic_obs"], action_next), dim=-1 + # ) + + def update_critic_q(self, data): + self.mean_q_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + with torch.no_grad(): + action_next = self.actor.act_inference(batch["next_actor_obs"]) + q_input_next = torch.cat( + (batch["next_critic_obs"], action_next), dim=-1 + ) + q_next = self.target_critic_q.evaluate(q_input_next) + v_next = self.critic_v.evaluate(batch["next_critic_obs"]) + q_target = batch["rewards"] + batch["dones"].logical_not() * ( + self.gamma + * ( + q_next * self.val_interpolation + + v_next * (1 - self.val_interpolation) + ) + ) + q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) + q_loss = self.critic_q.loss_fn(q_input, q_target) + self.critic_q_optimizer.zero_grad() + q_loss.backward() + nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) + self.critic_q_optimizer.step() + self.mean_q_loss += q_loss.item() + counter += 1 + + # TODO: check where to do polyak update (IPG repo does it here) + self.target_critic_q = polyak_update( + self.critic_q, self.target_critic_q, self.polyak + ) + self.mean_q_loss /= counter + + def update_critic_v(self, data): + self.mean_value_loss = 0 + counter = 0 + + generator = create_uniform_generator( + data, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch in generator: + value_loss = self.critic_v.loss_fn(batch["critic_obs"], batch["returns"]) + self.critic_v_optimizer.zero_grad() + value_loss.backward() + nn.utils.clip_grad_norm_(self.critic_v.parameters(), self.max_grad_norm) + self.critic_v_optimizer.step() + self.mean_value_loss += value_loss.item() + counter += 1 + self.mean_value_loss /= counter + + def update_actor(self, data_onpol, data_offpol): + self.mean_surrogate_loss = 0 + self.mean_offpol_loss = 0 + counter = 0 + + self.actor.update_distribution(data_onpol["actor_obs"]) + data_onpol["old_sigma"] = self.actor.action_std.detach() + data_onpol["old_mu"] = self.actor.action_mean.detach() + data_onpol["old_actions_log_prob"] = self.actor.get_actions_log_prob( + 
data_onpol["actions"] + ).detach() + + # Generate off-policy batches and use all on-policy data + generator_offpol = create_uniform_generator( + data_offpol, + self.batch_size, + max_gradient_steps=self.max_gradient_steps, + ) + for batch_offpol in generator_offpol: + self.actor.update_distribution(data_onpol["actor_obs"]) + actions_log_prob_onpol = self.actor.get_actions_log_prob( + data_onpol["actions"] + ) + mu_onpol = self.actor.action_mean + sigma_onpol = self.actor.action_std + + # * KL + if self.desired_kl is not None and self.schedule == "adaptive": + with torch.inference_mode(): + kl = torch.sum( + torch.log(sigma_onpol / data_onpol["old_sigma"] + 1.0e-5) + + ( + torch.square(data_onpol["old_sigma"]) + + torch.square(data_onpol["old_mu"] - mu_onpol) + ) + / (2.0 * torch.square(sigma_onpol)) + - 0.5, + axis=-1, + ) + kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range + + if kl_mean > self.desired_kl * 2.0: + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) + elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) + + for param_group in self.optimizer.param_groups: + # ! check this + param_group["lr"] = self.learning_rate + + # * On-policy surrogate loss + adv_onpol = data_onpol["advantages"] + if self.use_cv: + # TODO: control variate + critic_based_adv = 0 # get_control_variate(data_onpol, self.critic_v) + learning_signals = (adv_onpol - critic_based_adv) * (1 - self.inter_nu) + else: + learning_signals = adv_onpol * (1 - self.inter_nu) + + ratio = torch.exp( + actions_log_prob_onpol - data_onpol["old_actions_log_prob"] + ) + ratio_clipped = torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + surrogate = -learning_signals * ratio + surrogate_clipped = -learning_signals * ratio_clipped + loss_onpol = torch.max(surrogate, surrogate_clipped).mean() + + # * Off-policy loss + if self.beta == "on_policy": + loss_offpol = self.compute_loss_offpol(data_onpol) + elif self.beta == "off_policy": + loss_offpol = self.compute_loss_offpol(batch_offpol) + else: + raise ValueError(f"Invalid beta value: {self.beta}") + + if self.use_cv: + b = 1 + else: + b = self.inter_nu + + loss = loss_onpol + b * loss_offpol + + # * Gradient step + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) + self.optimizer.step() + self.mean_surrogate_loss += loss_onpol.item() + self.mean_offpol_loss += b * loss_offpol.item() + counter += 1 + self.mean_surrogate_loss /= counter + self.mean_offpol_loss /= counter + + def compute_loss_offpol(self, data): + obs = data["actor_obs"] + actions = self.actor.act_inference(obs) + q_input = torch.cat((data["critic_obs"], actions), dim=-1) + q_value = self.critic_q.evaluate(q_input) + return -q_value.mean() diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 0dd7907f..0738b86d 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -95,17 +95,8 @@ def update(self, data_onpol, data_offpol): data_onpol["advantages"] = normalize(data_onpol["advantages"]) self.update_critic_q(data_offpol) - - # Freeze Q-network params - for param in self.critic_q.parameters(): - param.requires_grad = False - - self.update_actor(data_onpol, data_offpol) self.update_critic_v(data_onpol) - - # Un-freeze Q-network params - for param in self.critic_q.parameters(): - param.requires_grad = True + self.update_actor(data_onpol, data_offpol) def 
update_critic_q(self, data): self.mean_q_loss = 0 @@ -129,6 +120,7 @@ def update_critic_q(self, data): ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) q_loss = self.critic_q.loss_fn(q_input, q_target) + print(q_loss.item()) self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index bc8a0eeb..d8b37756 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,19 +20,27 @@ def __init__( train_cfg, log_dir, data_list, - data_length=2990, + data_length=1500, data_name="SMOOTH_RL_CONTROLLER", + se_path=None, + use_simulator=True, exploration_scale=1.0, device="cpu", ): - # Init calls _set_up_alg which needs exploration scale - self.exploration_scale = exploration_scale - super().__init__(env, train_cfg, device) + # Instead of super init, only set necessary attributes + self.env = env + self.parse_train_cfg(train_cfg) + self.num_steps_per_env = self.cfg["num_steps_per_env"] self.log_dir = log_dir self.data_list = data_list # Describes structure of Robot-Software logs self.data_length = data_length # Logs must contain at least this many steps self.data_name = data_name + self.se_path = se_path + self.use_simulator = use_simulator + self.exploration_scale = exploration_scale + self.device = device + self._set_up_alg() def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -48,21 +56,11 @@ def _set_up_alg(self): else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) - if "state_estimator" in self.train_cfg.keys(): - self.se_cfg = self.train_cfg["state_estimator"] - state_estimator_network = StateEstimatorNN( - self.get_obs_size(self.se_cfg["obs"]), - self.get_obs_size(self.se_cfg["targets"]), - **self.se_cfg["network"], - ) - self.SE = StateEstimator( - state_estimator_network, device=self.device, **self.se_cfg - ) - else: - self.SE = None + alg_name = self.cfg["algorithm_class_name"] + alg_class = eval(alg_name) + self.ipg = alg_name in ["PPO_IPG", "LinkedIPG"] - alg_class = eval(self.cfg["algorithm_class_name"]) - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: critic_v = Critic(num_critic_obs, **self.critic_cfg) critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) target_critic_q = Critic(num_critic_obs + num_actions, **self.critic_cfg) @@ -78,6 +76,20 @@ def _set_up_alg(self): critic = Critic(num_critic_obs, **self.critic_cfg) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) + if "state_estimator" in self.train_cfg.keys() and self.se_path is not None: + self.se_cfg = self.train_cfg["state_estimator"] + state_estimator_network = StateEstimatorNN( + self.get_obs_size(self.se_cfg["obs"]), + self.get_obs_size(self.se_cfg["targets"]), + **self.se_cfg["network"], + ) + self.SE = StateEstimator( + state_estimator_network, device=self.device, **self.se_cfg + ) + self.load_se(self.se_path) + else: + self.SE = None + def parse_train_cfg(self, train_cfg): self.cfg = train_cfg["runner"] self.alg_cfg = train_cfg["algorithm"] @@ -85,7 +97,10 @@ def parse_train_cfg(self, train_cfg): self.critic_cfg = train_cfg["critic"] self.train_cfg = train_cfg - def get_data_dict(self, offpol=False): + def get_data_dict(self, offpol=False, load_path=None, save_path=None): + # Concatenate data with loaded dict + loaded_data_dict = torch.load(load_path) if load_path else None + checkpoint = self.cfg["checkpoint"] 
if offpol: # All files up until checkpoint @@ -147,6 +162,26 @@ def get_data_dict(self, offpol=False): data_dict["timed_out"] = torch.zeros(batch_size, device=self.device, dtype=bool) data_dict["dones"] = torch.zeros(batch_size, device=self.device, dtype=bool) + # Concatenate with loaded dict + if loaded_data_dict is not None: + loaded_batch_size = loaded_data_dict.batch_size + assert loaded_batch_size[0] == batch_size[0] + new_batch_size = ( + loaded_batch_size[0], + loaded_batch_size[1] + batch_size[1], + ) + data_dict = TensorDict( + { + key: torch.cat((loaded_data_dict[key], data_dict[key]), dim=1) + for key in data_dict.keys() + }, + device=self.device, + batch_size=new_batch_size, + ) + + if save_path: + torch.save(data_dict, save_path) + return data_dict def get_data_obs(self, obs_list, data_struct): @@ -160,8 +195,9 @@ def get_data_obs(self, obs_list, data_struct): return obs_all.float() - def get_data_rewards(self, data_struct, reward_weights, dt=0.01): - minimalist_cheetah = MinimalistCheetah(device=self.device) + def get_data_rewards(self, data_struct, reward_weights): + ctrl_dt = 1.0 / self.env.cfg.control.ctrl_frequency + minimalist_cheetah = MinimalistCheetah(ctrl_dt=ctrl_dt, device=self.device) rewards_dict = {name: [] for name in reward_weights.keys()} # for plotting rewards_all = torch.empty(0).to(self.device) @@ -188,7 +224,7 @@ def get_data_rewards(self, data_struct, reward_weights, dt=0.01): minimalist_cheetah.post_process() rewards_dict["total"] = rewards_all.tolist() - rewards_all *= dt # scaled for alg update + rewards_all *= ctrl_dt # scaled for alg update return rewards_all.float(), rewards_dict @@ -213,10 +249,9 @@ def update_state_estimates(self): ) idx += dim - def learn(self): - self.alg.switch_to_train() - - if self.env is not None: + def load_data(self, load_path=None, save_path=None): + # Load on- and off-policy data + if self.use_simulator: # Simulate on-policy data self.data_onpol = TensorDict( self.get_sim_data(), @@ -226,12 +261,21 @@ def learn(self): else: self.data_onpol = self.get_data_dict() - # Single alg update on data - if self.cfg["algorithm_class_name"] == "PPO_IPG": - self.data_offpol = self.get_data_dict(offpol=True) - self.alg.update(self.data_onpol, self.data_offpol) + if self.ipg: + self.data_offpol = self.get_data_dict( + offpol=True, load_path=load_path, save_path=save_path + ) else: + self.data_offpol = None + + def learn(self): + self.alg.switch_to_train() + + # Single alg update on data + if self.data_offpol is None: self.alg.update(self.data_onpol) + else: + self.alg.update(self.data_onpol, self.data_offpol) def get_sim_data(self): rewards_dict = {} @@ -328,16 +372,16 @@ def update_rewards(self, rewards_dict, terminated): ) def save(self, path): - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: torch.save( { "actor_state_dict": self.alg.actor.state_dict(), "critic_v_state_dict": self.alg.critic_v.state_dict(), "critic_q_state_dict": self.alg.critic_q.state_dict(), + "target_critic_q_state_dict": self.alg.target_critic_q.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), - "iter": 1, # only one iteration }, path, ) @@ -348,7 +392,6 @@ def save(self, path): "critic_state_dict": self.alg.critic.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(), - "iter": 1, # only one 
iteration }, path, ) @@ -357,9 +400,12 @@ def load(self, path, load_optimizer=True): loaded_dict = torch.load(path) self.alg.actor.load_state_dict(loaded_dict["actor_state_dict"]) - if self.cfg["algorithm_class_name"] == "PPO_IPG": + if self.ipg: self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index e46f5665..f5bcee26 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -27,8 +27,8 @@ def __init__(self, env, train_cfg, device="cpu"): def _set_up_alg(self): alg_class_name = self.cfg["algorithm_class_name"] - if alg_class_name != "PPO_IPG": - raise ValueError("IPGRunner only supports PPO_IPG") + if alg_class_name not in ["PPO_IPG", "LinkedIPG"]: + raise ValueError("IPGRunner only supports PPO_IPG or Linked_IPG") alg_class = eval(alg_class_name) num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -164,7 +164,7 @@ def learn(self): logger.toc("learning") if self.it % self.save_interval == 0: - self.save() + self.save(save_storage=False) storage_onpol.clear() # only clear on-policy storage logger.log_all_categories() @@ -173,7 +173,7 @@ def learn(self): logger.toc("runtime") logger.print_to_terminal() - self.save() + # self.save() @torch.no_grad def burn_in_normalization(self, n_iterations=100): @@ -240,6 +240,7 @@ def save(self, save_storage=False): "actor_state_dict": self.alg.actor.state_dict(), "critic_v_state_dict": self.alg.critic_v.state_dict(), "critic_q_state_dict": self.alg.critic_q.state_dict(), + "target_critic_q_state_dict": self.alg.target_critic_q.state_dict(), "optimizer_state_dict": self.alg.optimizer.state_dict(), "critic_v_opt_state_dict": self.alg.critic_v_optimizer.state_dict(), "critic_q_opt_state_dict": self.alg.critic_q_optimizer.state_dict(), @@ -263,6 +264,9 @@ def load(self, path, load_optimizer=True, load_actor_std=True): self.alg.actor.std = nn.Parameter(std_init) self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 87f60071..340c3cf9 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -13,15 +13,16 @@ import matplotlib.pyplot as plt ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE -LOAD_RUN = "Jul23_00-14-23_nu02_B8" +# SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +SE_PATH = None +LOAD_RUN = "Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" REWARDS_FILE = None # generate this file from logs, if None: just plot + PLOT_REWARDS = { - "Nu=0.0": "rewards_nu0.csv", - "Nu=0.5": "rewards_nu05.csv", - "Nu=0.9": "rewards_nu09.csv", - "Nu=0.9 new": "rewards_nu09_new.csv", + "Nu=0.5 sim onpol concat": "rewards_nu05_sim_concat.csv", + "Nu=0.5 sim onpol": "rewards_nu05_sim_onpol.csv", + "Nu=0.5 real onpol": "rewards_nu05_real_onpol.csv", } # Data struct 
fields from Robot-Software logs @@ -37,7 +38,8 @@ "phase_obs", # 8 "grf", # 9 "dof_pos_target", # 10 - "exploration_noise", # 11 + "torques", # 11 + "exploration_noise", # 12 "footer", ] @@ -68,7 +70,10 @@ def update_rewards_df(runner): iteration = int(file.split(".")[0]) path = os.path.join(ROOT_DIR, LOAD_RUN, file) data = scipy.io.loadmat(path) - data_dict[iteration] = data[runner.data_name][0][0] + runner.data_struct = data[runner.data_name][0][0] + if runner.SE: + runner.update_state_estimates() + data_dict[iteration] = runner.data_struct rewards_path = os.path.join(ROOT_DIR, LOAD_RUN, REWARDS_FILE) if os.path.exists(rewards_path): @@ -149,6 +154,7 @@ def setup(): log_dir, data_list=DATA_LIST, device=DEVICE, + se_path=SE_PATH, ) return runner @@ -157,8 +163,6 @@ def setup(): if __name__ == "__main__": if REWARDS_FILE is not None: runner = setup() - if SE_PATH is not None: - runner.load_se(SE_PATH) update_rewards_df(runner) # Plot rewards stats diff --git a/scripts/finetune.py b/scripts/finetune.py index 6247a466..64dac794 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -9,13 +9,20 @@ import os import torch import numpy as np +import pandas as pd ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" -SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +# SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE +SE_PATH = None OUTPUT_FILE = "output.txt" +LOSSES_FILE = "losses.csv" USE_SIMULATOR = True +# Load/save off-policy storage, this can contain many runs +OFFPOL_LOAD_FILE = "offpol_data_10.pt" +OFFPOL_SAVE_FILE = None + # Scales EXPLORATION_SCALE = 0.5 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) @@ -56,6 +63,8 @@ def setup(): train_cfg, log_dir, data_list=DATA_LIST, + se_path=SE_PATH, + use_simulator=USE_SIMULATOR, exploration_scale=EXPLORATION_SCALE, device=DEVICE, ) @@ -64,20 +73,31 @@ def setup(): def finetune(runner): + # Load model load_run = runner.cfg["load_run"] checkpoint = runner.cfg["checkpoint"] - load_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") - runner.load(load_path) + model_path = os.path.join(ROOT_DIR, load_run, "model_" + str(checkpoint) + ".pt") + runner.load(model_path) - if SE_PATH is not None: - runner.load_se(SE_PATH) + # Load data + load_path = ( + os.path.join(ROOT_DIR, load_run, OFFPOL_LOAD_FILE) if OFFPOL_LOAD_FILE else None + ) + save_path = ( + os.path.join(ROOT_DIR, load_run, OFFPOL_SAVE_FILE) if OFFPOL_SAVE_FILE else None + ) + runner.load_data(load_path=load_path, save_path=save_path) + + # Get old inference actions + action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + actions_old = action_scales * runner.alg.actor.act_inference( + runner.data_onpol["actor_obs"] + ) # Perform a single update runner.learn() - # Compare old and new actions - actions_old = runner.data_onpol["actions"] - action_scales = torch.tensor(ACTION_SCALES).to(DEVICE) + # Compare old to new actions actions_new = action_scales * runner.alg.actor.act_inference( runner.data_onpol["actor_obs"] ) @@ -92,6 +112,7 @@ def finetune(runner): # Print to output file with open(os.path.join(ROOT_DIR, load_run, OUTPUT_FILE), "a") as f: f.write(f"############ Checkpoint: {checkpoint} #######################\n") + f.write(f"############## Nu={runner.alg.inter_nu} ###################\n") f.write("############### DATA ###############\n") f.write(f"Data on-policy shape: {runner.data_onpol.shape}\n") if runner.data_offpol is not None: @@ -107,6 +128,38 @@ def 
finetune(runner): f.write(f"Std action diff per actuator: {diff.std(dim=(0, 1))}\n") f.write(f"Overall mean action diff: {diff.mean()}\n") + # Log losses to csv + losses_path = os.path.join(ROOT_DIR, load_run, LOSSES_FILE) + if not os.path.exists(losses_path): + if runner.data_offpol is None: + losses_df = pd.DataFrame( + columns=["checkpoint", "value_loss", "surrogate_loss"] + ) + else: + losses_df = pd.DataFrame( + columns=[ + "checkpoint", + "value_loss", + "q_loss", + "surrogate_loss", + "offpol_loss", + ] + ) + else: + losses_df = pd.read_csv(losses_path) + + append_data = { + "checkpoint": checkpoint, + "value_loss": runner.alg.mean_value_loss, + "surrogate_loss": runner.alg.mean_surrogate_loss, + } + if runner.data_offpol is not None: + append_data["q_loss"] = runner.alg.mean_q_loss + append_data["offpol_loss"] = runner.alg.mean_offpol_loss + + losses_df = losses_df._append(append_data, ignore_index=True) + losses_df.to_csv(losses_path, index=False) + if __name__ == "__main__": runner = setup() diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index 95e84728..b232a36a 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,10 +2,11 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul23_00-14-23_nu02_B8" +LOAD_RUN="Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" CHECKPOINT=1000 -N_RUNS=6 -EVAL=false # make sure to turn exploration on/off in RS +N_RUNS=10 +INTER_NU=0.9 # can be fixed or adaptive +EVAL=false # no finetuning, just evaluate without exploration (set in RS) # Set directories QGYM_DIR="/home/lmolnar/workspace/QGym" @@ -19,11 +20,11 @@ cp ${QGYM_LOG_DIR}/exported_${CHECKPOINT}/* ${RS_DIR}/config/systems/quadruped/c for i in $(seq 1 $N_RUNS) do - # Store LCM logs in file labeled with checkpoint - FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} - if [ -f "$FILE" ]; then - rm "$FILE" - fi + # Store logs in files labeled by checkpoint + LCM_FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} + MAT_FILE=${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat + rm ${LCM_FILE} + rm ${MAT_FILE} # Run logging script in background ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & @@ -40,10 +41,11 @@ do cp ${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat ${QGYM_LOG_DIR} # Finetune in QGym + # INTER_NU=$(echo "0.05 * $i" | bc) # adaptive if [ "$EVAL" = false ] ; then conda deactivate conda activate qgym - python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} + python ${QGYM_DIR}/scripts/finetune.py --task=mini_cheetah_finetune --headless --load_run=${LOAD_RUN} --checkpoint=${CHECKPOINT} --inter_nu=${INTER_NU} fi # Copy policy to RS diff --git a/scripts/visualize_ipg.py b/scripts/visualize_ipg.py index 993f4e0d..95b03cad 100644 --- a/scripts/visualize_ipg.py +++ b/scripts/visualize_ipg.py @@ -17,8 +17,8 @@ DEVICE = "cpu" # * Setup -LOAD_RUN = "Jul17_17-22-37_IPG_nu1" -TITLE = "IPG nu=1.0" +LOAD_RUN = "Jul25_11-28-41_LinkedIPG_nu05_v05" +TITLE = "LinkedIPG nu=0.5 blend=0.5 polyak=0.9" IT_RANGE = range(20, 101, 20) RUN_DIR = os.path.join(LEGGED_GYM_ROOT_DIR, "logs", "pendulum", LOAD_RUN) @@ -53,7 +53,7 @@ for it in IT_RANGE: # load data - base_data = torch.load(os.path.join(RUN_DIR, "data_onpol{}.pt".format(it))).to( + base_data = torch.load(os.path.join(RUN_DIR, "data_onpol_{}.pt".format(it))).to( DEVICE ) data = base_data.detach().clone() From 261a4d56880ecfee39f23921871d9643ef4e9a4d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 27 Jul 2024 
09:19:27 -0400 Subject: [PATCH 42/42] cleanup --- .../mini_cheetah_finetune_config.py | 13 ++--- .../mini_cheetah/mini_cheetah_ref_config.py | 16 +++++-- gym/envs/mini_cheetah/minimalist_cheetah.py | 11 +++-- learning/algorithms/linked_ipg.py | 15 +++--- learning/algorithms/ppo_ipg.py | 10 +++- learning/runners/finetune_runner.py | 18 +++++-- learning/runners/ipg_runner.py | 9 ++-- scripts/eval_rewards.py | 15 +++--- scripts/finetune.py | 20 ++++---- scripts/finetune_multiple.sh | 5 +- scripts/generate_commands.py | 47 +++++++++++++------ scripts/plot_losses.py | 30 ++++++++++++ 12 files changed, 146 insertions(+), 63 deletions(-) create mode 100644 scripts/plot_losses.py diff --git a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py index f4fd30a8..31f41894 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_finetune_config.py @@ -104,7 +104,7 @@ class weights: tracking_ang_vel = 2.0 orientation = 1.0 min_base_height = 1.5 - stand_still = 2.0 + stand_still = 1.0 swing_grf = 3.0 stance_grf = 3.0 action_rate = 0.01 @@ -138,13 +138,14 @@ class algorithm(MiniCheetahRefRunnerCfg.algorithm): use_cv = False inter_nu = 0.9 beta = "off_policy" - storage_size = 30000 + storage_size = 30_000 + # val_interpolation = 0.8 # Finetuning clip_param = 0.2 - max_gradient_steps = 4 - batch_size = 30000 - learning_rate = 1e-4 + max_gradient_steps = 8 + batch_size = 30_000 + learning_rate = 5e-5 # ACTOR schedule = "fixed" class runner(MiniCheetahRefRunnerCfg.runner): @@ -156,6 +157,6 @@ class runner(MiniCheetahRefRunnerCfg.runner): # Finetuning resume = True - load_run = "Jul23_00-14-23_nu02_B8" + load_run = "Jul24_22-48-41_nu05_B8" checkpoint = 1000 save_interval = 1 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 34c20416..2e808719 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -146,15 +146,23 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): desired_kl = 0.02 # 0.02 for smooth-exploration, else 0.01 # IPG - polyak = 0.995 + polyak = 0.95 use_cv = False inter_nu = 0.2 beta = "off_policy" - storage_size = 8 * 32 * 4096 # num_policies*num_stpes*num_envs + storage_size = 8 * 32 * 4096 # num_policies*num_steps*num_envs + val_interpolation = 0.8 # 0: use V(s'), 1: use Q(s', pi(s')) + learning_rate = 1.0e-3 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 1000 # number of policy updates - algorithm_class_name = "PPO_IPG" + max_iterations = 800 # number of policy updates + algorithm_class_name = "LinkedIPG" num_steps_per_env = 32 + + # resume = False + # load_run = "Jul25_12-24-16_LinkedIPG_50Hz_nu02_v08" + # checkpoint = 1000 diff --git a/gym/envs/mini_cheetah/minimalist_cheetah.py b/gym/envs/mini_cheetah/minimalist_cheetah.py index 4702f6ac..8052697d 100644 --- a/gym/envs/mini_cheetah/minimalist_cheetah.py +++ b/gym/envs/mini_cheetah/minimalist_cheetah.py @@ -60,14 +60,15 @@ def set_states( ) self.grf = torch.tensor(grf, device=self.device).unsqueeze(0) - # Compute phase sin + # Get phase sin phase_obs = torch.tensor(phase_obs, device=self.device).unsqueeze(0) self.phase_sin = phase_obs[:, 0].unsqueeze(0) - # Set targets, these are scaled in Robot-Software - self.dof_pos_target = torch.tensor( - dof_pos_target, device=self.device - ).unsqueeze(0) + # 
Set target history + self.dof_pos_target = ( + torch.tensor(dof_pos_target, device=self.device).unsqueeze(0) + * self.dof_pos_scales + ) if self.dof_target_prev is None: self.dof_target_prev = self.dof_pos_target if self.dof_target_prev2 is None: diff --git a/learning/algorithms/linked_ipg.py b/learning/algorithms/linked_ipg.py index de024276..dbaef5eb 100644 --- a/learning/algorithms/linked_ipg.py +++ b/learning/algorithms/linked_ipg.py @@ -96,13 +96,14 @@ def update(self, data_onpol, data_offpol): data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] data_onpol["advantages"] = normalize(data_onpol["advantages"]) - data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) - data_offpol["advantages"] = compute_generalized_advantages( - data_offpol, self.gamma, self.lam, self.critic_v - ) - data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] - - self.update_critic_v(data_offpol) + # TODO: Possibly use off-policy GAE for V-critic update + # data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + # data_offpol["advantages"] = compute_generalized_advantages( + # data_offpol, self.gamma, self.lam, self.critic_v + # ) + # data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + + self.update_critic_v(data_onpol) self.update_critic_q(data_offpol) self.update_actor(data_onpol, data_offpol) diff --git a/learning/algorithms/ppo_ipg.py b/learning/algorithms/ppo_ipg.py index 0738b86d..c3f1c5be 100644 --- a/learning/algorithms/ppo_ipg.py +++ b/learning/algorithms/ppo_ipg.py @@ -94,8 +94,15 @@ def update(self, data_onpol, data_offpol): data_onpol["returns"] = data_onpol["advantages"] + data_onpol["values"] data_onpol["advantages"] = normalize(data_onpol["advantages"]) - self.update_critic_q(data_offpol) + # TODO: Possibly use off-policy GAE for V-critic update + # data_offpol["values"] = self.critic_v.evaluate(data_offpol["critic_obs"]) + # data_offpol["advantages"] = compute_generalized_advantages( + # data_offpol, self.gamma, self.lam, self.critic_v + # ) + # data_offpol["returns"] = data_offpol["advantages"] + data_offpol["values"] + self.update_critic_v(data_onpol) + self.update_critic_q(data_offpol) self.update_actor(data_onpol, data_offpol) def update_critic_q(self, data): @@ -120,7 +127,6 @@ def update_critic_q(self, data): ) q_input = torch.cat((batch["critic_obs"], batch["actions"]), dim=-1) q_loss = self.critic_q.loss_fn(q_input, q_target) - print(q_loss.item()) self.critic_q_optimizer.zero_grad() q_loss.backward() nn.utils.clip_grad_norm_(self.critic_q.parameters(), self.max_grad_norm) diff --git a/learning/runners/finetune_runner.py b/learning/runners/finetune_runner.py index d8b37756..c4a33f64 100644 --- a/learning/runners/finetune_runner.py +++ b/learning/runners/finetune_runner.py @@ -20,7 +20,7 @@ def __init__( train_cfg, log_dir, data_list, - data_length=1500, + data_length=3000, data_name="SMOOTH_RL_CONTROLLER", se_path=None, use_simulator=True, @@ -271,6 +271,10 @@ def load_data(self, load_path=None, save_path=None): def learn(self): self.alg.switch_to_train() + # Set fixed actor LR from config + for param_group in self.alg.optimizer.param_groups: + param_group["lr"] = self.alg_cfg["learning_rate"] + # Single alg update on data if self.data_offpol is None: self.alg.update(self.data_onpol) @@ -403,9 +407,15 @@ def load(self, path, load_optimizer=True): if self.ipg: self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) 
self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) - self.alg.target_critic_q.load_state_dict( - loaded_dict["target_critic_q_state_dict"] - ) + # TODO: Possibly always load target with critic_q_state_dict + try: + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) + except: + self.alg.target_critic_q.load_state_dict( + loaded_dict["critic_q_state_dict"] + ) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/learning/runners/ipg_runner.py b/learning/runners/ipg_runner.py index f5bcee26..252f6098 100644 --- a/learning/runners/ipg_runner.py +++ b/learning/runners/ipg_runner.py @@ -264,9 +264,12 @@ def load(self, path, load_optimizer=True, load_actor_std=True): self.alg.actor.std = nn.Parameter(std_init) self.alg.critic_v.load_state_dict(loaded_dict["critic_v_state_dict"]) self.alg.critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) - self.alg.target_critic_q.load_state_dict( - loaded_dict["target_critic_q_state_dict"] - ) + try: + self.alg.target_critic_q.load_state_dict( + loaded_dict["target_critic_q_state_dict"] + ) + except: + self.alg.target_critic_q.load_state_dict(loaded_dict["critic_q_state_dict"]) if load_optimizer: self.alg.optimizer.load_state_dict(loaded_dict["optimizer_state_dict"]) self.alg.critic_v_optimizer.load_state_dict( diff --git a/scripts/eval_rewards.py b/scripts/eval_rewards.py index 340c3cf9..1c527e5f 100644 --- a/scripts/eval_rewards.py +++ b/scripts/eval_rewards.py @@ -15,14 +15,17 @@ ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" # SE_PATH = f"{LEGGED_GYM_ROOT_DIR}/logs/SE/model_1000.pt" # if None: no SE SE_PATH = None -LOAD_RUN = "Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" +LOAD_RUN = "Jul24_22-48-41_nu05_B8" -REWARDS_FILE = None # generate this file from logs, if None: just plot +REWARDS_FILE = ( + "rewards_nu09_nosim2.csv" # generate this file from logs, if None: just plot +) PLOT_REWARDS = { - "Nu=0.5 sim onpol concat": "rewards_nu05_sim_concat.csv", - "Nu=0.5 sim onpol": "rewards_nu05_sim_onpol.csv", - "Nu=0.5 real onpol": "rewards_nu05_real_onpol.csv", + "Nu=0.5 no sim": "rewards_nu05_nosim.csv", + "Nu=0.9 no sim": "rewards_nu09_nosim.csv", + "Nu=0.9 no sim 2": "rewards_nu09_nosim2.csv", + "Nu=0.95 no sim": "rewards_nu095_nosim.csv", } # Data struct fields from Robot-Software logs @@ -48,7 +51,7 @@ "tracking_ang_vel": 2.0, "min_base_height": 1.5, "orientation": 1.0, - "stand_still": 2.0, + "stand_still": 1.0, "swing_grf": 3.0, "stance_grf": 3.0, "action_rate": 0.01, diff --git a/scripts/finetune.py b/scripts/finetune.py index 64dac794..4db64925 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -17,14 +17,15 @@ OUTPUT_FILE = "output.txt" LOSSES_FILE = "losses.csv" -USE_SIMULATOR = True +USE_SIMULATOR = False # For on-policy data +DATA_LENGTH = 12_000 # Robot-Software logs must contain at least this many steps # Load/save off-policy storage, this can contain many runs -OFFPOL_LOAD_FILE = "offpol_data_10.pt" -OFFPOL_SAVE_FILE = None +OFFPOL_LOAD_FILE = None # "offpol_data.pt" +OFFPOL_SAVE_FILE = None # "offpol_data.pt" # Scales -EXPLORATION_SCALE = 0.5 # used during data collection +EXPLORATION_SCALE = 0.8 # used during data collection ACTION_SCALES = np.tile(np.array([0.2, 0.3, 0.3]), 4) # Data struct fields from Robot-Software logs @@ -63,6 +64,7 @@ def setup(): train_cfg, log_dir, data_list=DATA_LIST, + data_length=DATA_LENGTH, se_path=SE_PATH, use_simulator=USE_SIMULATOR, 
exploration_scale=EXPLORATION_SCALE, @@ -131,11 +133,7 @@ def finetune(runner): # Log losses to csv losses_path = os.path.join(ROOT_DIR, load_run, LOSSES_FILE) if not os.path.exists(losses_path): - if runner.data_offpol is None: - losses_df = pd.DataFrame( - columns=["checkpoint", "value_loss", "surrogate_loss"] - ) - else: + if runner.ipg: losses_df = pd.DataFrame( columns=[ "checkpoint", @@ -145,6 +143,10 @@ def finetune(runner): "offpol_loss", ] ) + else: + losses_df = pd.DataFrame( + columns=["checkpoint", "value_loss", "surrogate_loss"] + ) else: losses_df = pd.read_csv(losses_path) diff --git a/scripts/finetune_multiple.sh b/scripts/finetune_multiple.sh index b232a36a..e3dd96c1 100644 --- a/scripts/finetune_multiple.sh +++ b/scripts/finetune_multiple.sh @@ -2,9 +2,9 @@ source /home/lmolnar/miniconda3/etc/profile.d/conda.sh # Args -LOAD_RUN="Jul25_16-06-36_LinkedIPG_50Hz_nu02_v08" +LOAD_RUN="Jul24_22-48-41_nu05_B8" CHECKPOINT=1000 -N_RUNS=10 +N_RUNS=5 INTER_NU=0.9 # can be fixed or adaptive EVAL=false # no finetuning, just evaluate without exploration (set in RS) @@ -24,7 +24,6 @@ do LCM_FILE=${RS_DIR}/logging/lcm_logs/${CHECKPOINT} MAT_FILE=${RS_DIR}/logging/matlab_logs/${CHECKPOINT}.mat rm ${LCM_FILE} - rm ${MAT_FILE} # Run logging script in background ${RS_DIR}/logging/scripts/run_lcm_logger.sh ${CHECKPOINT} & diff --git a/scripts/generate_commands.py b/scripts/generate_commands.py index e8ef18a4..9fe21d79 100644 --- a/scripts/generate_commands.py +++ b/scripts/generate_commands.py @@ -4,23 +4,42 @@ # Command ranges during training: # x_range = [-2.0, 3.0] # [m/s] # y_range = [-1.0, 1.0] # [m/s] -# yaw_range = [-3.0. 3.0] # [rad/s] +# yaw_range = [-3.0, 3.0] # [rad/s] # Command ranges for finetuning: -x_range = [-1.0, 1.5] # [m/s] -y_range = [-0.5, 0.5] # [m/s] -yaw_range = [-1.5, 1.5] # [rad/s] +x_range = [-0.67, 1.0] # [m/s] +y_range = [-0.33, 0.33] # [m/s] +yaw_range = [-2.0, 2.0] # [rad/s] -# Generate random command sequence -N = 10 -steps = 300 -commands = np.zeros((N * steps, 3)) +# Generate structured command sequence (fixed lin/ang vel, yaw, some random) +N = 100 +cmds_zero = np.array([[0, 0, 0]]).repeat(N, axis=0) +cmds = np.zeros((N, 3)) +for _ in range(4): + cmds = np.append(cmds, np.array([[x_range[1], 0, 0]]).repeat(3 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[x_range[0], 0, 0]]).repeat(3 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[0, y_range[1], 0]]).repeat(2 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append(cmds, np.array([[0, y_range[0], 0]]).repeat(2 * N, axis=0), axis=0) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append( + cmds, np.array([[0, 0, yaw_range[1]]]).repeat(2 * N, axis=0), axis=0 + ) + cmds = np.append(cmds, cmds_zero, axis=0) + cmds = np.append( + cmds, np.array([[0, 0, yaw_range[0]]]).repeat(2 * N, axis=0), axis=0 + ) + cmds = np.append(cmds, cmds_zero, axis=0) -for i in range(1, N): # first iteration 0 commands - x = random.uniform(x_range[0], x_range[1]) - y = random.uniform(y_range[0], y_range[1]) - yaw = random.uniform(yaw_range[0], yaw_range[1]) - commands[i * steps : (i + 1) * steps] = np.array([x, y, yaw]) + for i in range(5): + x = random.uniform(x_range[0], x_range[1]) + y = random.uniform(y_range[0], y_range[1]) + yaw = random.uniform(yaw_range[0], yaw_range[1]) + cmds = np.append(cmds, np.array([[x, y, yaw]]).repeat(2 * N, axis=0), axis=0) + +print(cmds.shape) # Export to txt 
-np.savetxt("commands.txt", commands, fmt="%.3f") +np.savetxt("commands_long.txt", cmds, fmt="%.3f") diff --git a/scripts/plot_losses.py b/scripts/plot_losses.py new file mode 100644 index 00000000..e165b860 --- /dev/null +++ b/scripts/plot_losses.py @@ -0,0 +1,30 @@ +from gym import LEGGED_GYM_ROOT_DIR + +import os +import matplotlib.pyplot as plt +import pandas as pd + +ROOT_DIR = f"{LEGGED_GYM_ROOT_DIR}/logs/mini_cheetah_ref/" +LOAD_RUN = "Jul26_11-58-37_LinkedIPG_100Hz_nu02_v08" + +PLOT_LOSSES = { + "Nu=0.9 sim": "losses_nu09_sim.csv", + "Nu=0.9 no sim": "losses_nu09_nosim.csv", +} + +LABELS = ["value_loss", "surrogate_loss", "q_loss", "offpol_loss"] + +fig, axs = plt.subplots(2, 2, figsize=(10, 6)) +plt.suptitle(" IPG Finetuning Losses") + +for i, label in enumerate(LABELS): + for name, file in PLOT_LOSSES.items(): + data = pd.read_csv(os.path.join(ROOT_DIR, LOAD_RUN, file)) + axs[i // 2, i % 2].plot(data[label], label=name) + axs[i // 2, i % 2].set_title(label) + axs[i // 2, i % 2].set_xlabel("Checkpoint") + axs[i // 2, i % 2].set_ylabel("Loss") + axs[i // 2, i % 2].legend() + +plt.tight_layout() +plt.show()