From dd260df75f48fcc3619b76e348092359cb1dbb6b Mon Sep 17 00:00:00 2001 From: Nicola Irmiger Date: Wed, 17 Apr 2024 11:23:43 -0400 Subject: [PATCH 1/4] pink noise --- gym/envs/cartpole/cartpole_config.py | 3 + .../mini_cheetah/mini_cheetah_ref_config.py | 3 +- learning/modules/actor_critic.py | 13 +++ learning/modules/pink_actor.py | 104 ++++++++++++++++++ plots/plot.py | 2 +- 5 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 learning/modules/pink_actor.py diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py index d8a19292..f123ef7b 100644 --- a/gym/envs/cartpole/cartpole_config.py +++ b/gym/envs/cartpole/cartpole_config.py @@ -74,6 +74,9 @@ class policy(FixedRobotCfgPPO.policy): actor_hidden_dims = [num_units] * num_layers critic_hidden_dims = [num_units] * num_layers activation = "elu" + pink_exploration = True + smooth_exploration = False + exploration_sample_freq = 8 actor_obs = [ "cart_obs", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index fda3f345..eaec015b 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,8 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = True + smooth_exploration = False + pink_exploration = True exploration_sample_freq = 8 actor_obs = [ diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 29f6cdf7..6d00a458 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -3,6 +3,7 @@ from .actor import Actor from .smooth_actor import SmoothActor from .critic import Critic +from .pink_actor import PinkActor class ActorCritic(nn.Module): @@ -17,6 +18,7 @@ def __init__( init_noise_std=1.0, normalize_obs=True, smooth_exploration=False, + pink_exploration=False, **kwargs, ): if kwargs: @@ -35,6 +37,17 @@ def __init__( init_noise_std, normalize_obs, ) + + elif pink_exploration: + self.actor = PinkActor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) + else: self.actor = Actor( num_actor_obs, diff --git a/learning/modules/pink_actor.py b/learning/modules/pink_actor.py new file mode 100644 index 00000000..e4bc96ca --- /dev/null +++ b/learning/modules/pink_actor.py @@ -0,0 +1,104 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal +import numpy as np +from pink import ColoredNoiseProcess + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the pinkNoise paper. See code: +# https://github.com/martius-lab/pink-noise-rl/blob/main/pink/sb3.py +class PinkActor(Actor): + _latent_sde: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = False, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = -0.5, + beta=0.5, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + self.beta = beta + + # TODO: is 500 correct? 
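+        # ColoredNoiseProcess(beta, size=(dims, n_samples)) from pink-noise-rl
+        # pre-generates a buffer of temporally correlated noise; each sample()
+        # call returns the next time slice, and the buffer is regenerated once
+        # exhausted, so the second dimension should cover one rollout length.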
+ self.gen = ColoredNoiseProcess(beta=self.beta, size=(self.num_actions, 1000)) + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + + self.log_std = nn.Parameter( + torch.ones(self.num_actions) * log_std_init, requires_grad=True + ) + + self.distribution = None + + # Debug mode for plotting + self.debug = True + + def update_distribution(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + mean_actions = self.mean_actions_net(self._latent_sde) + action_std = torch.ones_like(mean_actions) * torch.exp(self.log_std) + self.distribution = Normal(mean_actions, action_std) + + # TODO[ni]: Sample actions that do not fit into the distribution at all sometimes + def act(self, observations): + self.update_distribution(observations) + if np.isscalar(self.beta): + cn_sample = torch.tensor(self.gen.sample()).float() + else: + cn_sample = torch.tensor([cnp.sample() for cnp in self.gen]).float() + + mean = self.distribution.mean + cn_sample = cn_sample.to(self.log_std.device) + + sample = mean + torch.exp(self.log_std) * cn_sample + if self.debug: + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_pink.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return torch.tanh(sample) + + def act_inference(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_actions_log_prob(self, actions): + eps = torch.finfo(actions.dtype).eps + gaussian_actions = actions.clamp(min=-1.0 + eps, max=1.0 - eps) + gaussian_actions = 0.5 * ( + gaussian_actions.log1p() - (-gaussian_actions).log1p() + ) + log_prob = super().get_actions_log_prob(gaussian_actions) + log_prob -= torch.sum(torch.log(1 - actions**2 + self.epsilon), dim=1) + return log_prob diff --git a/plots/plot.py b/plots/plot.py index 5018cb4d..4f33b421 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt # Read the CSV file -name = "distribution_smooth" +name = "plots/distribution_pink" data = pd.read_csv(name + ".csv") # Plot the data (last n steps) From 4e9d1d09dd027c2685ef95495b934909d37db57e Mon Sep 17 00:00:00 2001 From: Nicola Irmiger Date: Fri, 19 Apr 2024 17:02:45 -0400 Subject: [PATCH 2/4] Clean up PinkActor --- learning/modules/pink_actor.py | 52 +++++----------------------------- 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/learning/modules/pink_actor.py b/learning/modules/pink_actor.py index e4bc96ca..b88933fe 100644 --- a/learning/modules/pink_actor.py +++ b/learning/modules/pink_actor.py @@ -5,7 +5,6 @@ from pink import ColoredNoiseProcess from .actor import Actor -from .utils import create_MLP from gym import LEGGED_GYM_ROOT_DIR @@ -13,48 +12,26 @@ # The following implementation is based on the pinkNoise paper. 
See code: # https://github.com/martius-lab/pink-noise-rl/blob/main/pink/sb3.py class PinkActor(Actor): - _latent_sde: torch.Tensor - def __init__( self, *args, - full_std: bool = True, - use_exp_ln: bool = False, - learn_features: bool = True, epsilon: float = 1e-6, - log_std_init: float = -0.5, + log_std_init: float = 0.0, beta=0.5, **kwargs, ): super().__init__(*args, **kwargs) - self.full_std = full_std - self.use_exp_ln = use_exp_ln - self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init self.beta = beta - # TODO: is 500 correct? - self.gen = ColoredNoiseProcess(beta=self.beta, size=(self.num_actions, 1000)) - - # Create latent NN and last layer - self.latent_net = create_MLP( - self.num_obs, - self.num_actions, - self.hidden_dims, - self.activation, - latent=True, - ) - self.latent_dim = self.hidden_dims[-1] - - self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # TODO[ni]: get control frequency and episode time from config + self.gen = ColoredNoiseProcess(beta=self.beta, size=(self.num_actions, 500)) self.log_std = nn.Parameter( torch.ones(self.num_actions) * log_std_init, requires_grad=True ) - self.distribution = None - # Debug mode for plotting self.debug = True @@ -62,14 +39,10 @@ def update_distribution(self, observations): if self._normalize_obs: observations = self.normalize(observations) # Get latent features and compute distribution - self._latent_sde = self.latent_net(observations) - if not self.learn_features: - self._latent_sde = self._latent_sde.detach() - mean_actions = self.mean_actions_net(self._latent_sde) + mean_actions = self.NN(observations) action_std = torch.ones_like(mean_actions) * torch.exp(self.log_std) self.distribution = Normal(mean_actions, action_std) - # TODO[ni]: Sample actions that do not fit into the distribution at all sometimes def act(self, observations): self.update_distribution(observations) if np.isscalar(self.beta): @@ -83,22 +56,11 @@ def act(self, observations): sample = mean + torch.exp(self.log_std) * cn_sample if self.debug: path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_pink.csv" - self.log_actions(mean[0][2], sample[0][2], path) - return torch.tanh(sample) + self.log_actions(mean[0][0], sample[0][0], path) + return sample def act_inference(self, observations): if self._normalize_obs: observations = self.normalize(observations) - latent_sde = self.latent_net(observations) - mean_actions = self.mean_actions_net(latent_sde) + mean_actions = self.NN(observations) return mean_actions - - def get_actions_log_prob(self, actions): - eps = torch.finfo(actions.dtype).eps - gaussian_actions = actions.clamp(min=-1.0 + eps, max=1.0 - eps) - gaussian_actions = 0.5 * ( - gaussian_actions.log1p() - (-gaussian_actions).log1p() - ) - log_prob = super().get_actions_log_prob(gaussian_actions) - log_prob -= torch.sum(torch.log(1 - actions**2 + self.epsilon), dim=1) - return log_prob From 7309ec0b1b9c96a58157889d112686570c2ce986 Mon Sep 17 00:00:00 2001 From: Nicola Irmiger Date: Fri, 26 Apr 2024 16:28:57 -0400 Subject: [PATCH 3/4] Update to PPO2 --- learning/modules/__init__.py | 1 + learning/modules/pink_actor.py | 6 ++++-- learning/runners/BaseRunner.py | 4 +++- learning/runners/old_policy_runner.py | 4 +++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 410e2f25..79ac2748 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -35,3 +35,4 @@ from .actor import Actor from .critic import 
Critic from .smooth_actor import SmoothActor +from .pink_actor import PinkActor diff --git a/learning/modules/pink_actor.py b/learning/modules/pink_actor.py index b88933fe..6111694b 100644 --- a/learning/modules/pink_actor.py +++ b/learning/modules/pink_actor.py @@ -37,7 +37,8 @@ def __init__( def update_distribution(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) # Get latent features and compute distribution mean_actions = self.NN(observations) action_std = torch.ones_like(mean_actions) * torch.exp(self.log_std) @@ -61,6 +62,7 @@ def act(self, observations): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) mean_actions = self.NN(observations) return mean_actions diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 39ce336d..f2a9508c 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic, SmoothActor +from learning.modules import Actor, Critic, SmoothActor, PinkActor from learning.utils import remove_zero_weighted_rewards @@ -24,6 +24,8 @@ def _set_up_alg(self): num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) if self.actor_cfg["smooth_exploration"]: actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + elif self.actor_cfg["pink_exploration"]: + actor = PinkActor(num_actor_obs, num_actions, **self.actor_cfg) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 8b58dbe4..6c863adb 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic, SmoothActor +from learning.modules import ActorCritic, Actor, Critic, SmoothActor, PinkActor logger = Logger() @@ -26,6 +26,8 @@ def _set_up_alg(self): num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) if self.actor_cfg["smooth_exploration"]: actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + elif self.actor_cfg["pink_exploration"]: + actor = PinkActor(num_actor_obs, num_actions, **self.actor_cfg) else: actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) From 42d9456865fb0adf9abbcd251ba9e364f0fc430d Mon Sep 17 00:00:00 2001 From: Nicola Irmiger Date: Mon, 10 Jun 2024 11:17:09 -0400 Subject: [PATCH 4/4] Final commit --- gym/envs/cartpole/cartpole_config.py | 9 +- .../mini_cheetah/mini_cheetah_ref_config.py | 11 +- .../calculate_smoothness.py | 97 ++++++++++ gym/exploration_analysis/plot_ft.py | 106 +++++++++++ .../plot_play.py | 0 gym/exploration_analysis/plot_power.py | 77 ++++++++ .../plot_train.py | 7 +- gym/exploration_analysis/plot_variance.py | 167 ++++++++++++++++++ gym/smooth_exploration/plot_ft.py | 56 ------ learning/modules/__init__.py | 2 +- .../{pink_actor.py => colored_actor.py} | 39 ++-- learning/modules/smooth_actor.py | 2 + learning/runners/BaseRunner.py | 10 +- learning/runners/old_policy_runner.py | 17 +- learning/runners/on_policy_runner.py | 
38 ++--
 scripts/log_play.py                           |   2 +-
 scripts/log_train.py                          |  11 +-
 .../sweep_colored_exploration.json            |  16 ++
 18 files changed, 546 insertions(+), 121 deletions(-)
 create mode 100644 gym/exploration_analysis/calculate_smoothness.py
 create mode 100644 gym/exploration_analysis/plot_ft.py
 rename gym/{smooth_exploration => exploration_analysis}/plot_play.py (100%)
 create mode 100644 gym/exploration_analysis/plot_power.py
 rename gym/{smooth_exploration => exploration_analysis}/plot_train.py (91%)
 create mode 100644 gym/exploration_analysis/plot_variance.py
 delete mode 100644 gym/smooth_exploration/plot_ft.py
 rename learning/modules/{pink_actor.py => colored_actor.py} (61%)
 create mode 100644 scripts/sweep_configs/sweep_colored_exploration.json

diff --git a/gym/envs/cartpole/cartpole_config.py b/gym/envs/cartpole/cartpole_config.py
index fd47b415..1cf8d250 100644
--- a/gym/envs/cartpole/cartpole_config.py
+++ b/gym/envs/cartpole/cartpole_config.py
@@ -73,9 +73,12 @@ class policy(FixedRobotCfgPPO.policy):
         num_units = 32
         hidden_dims = [num_units] * num_layers
         activation = "elu"
-        pink_exploration = True
-        smooth_exploration = False
-        exploration_sample_freq = 8
+
+        class exploration:
+            # Type of actor, can be "smooth", "colored" or "white"
+            type = "colored"
+            sample_freq = 8
+            beta = 1.0
 
         obs = [
             "cart_obs",
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index b8642131..b4b5ee7e 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -73,9 +73,14 @@ class actor:
         hidden_dims = [256, 256, 128]
         # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
         activation = "elu"
-        smooth_exploration = False
-        pink_exploration = True
-        exploration_sample_freq = 8
+
+        class exploration:
+            # Type of actor, can be "smooth", "colored" or "white"
+            type = "colored"
+            sample_freq = 16
+            beta = 1
+            # For sweep
+            log_std_init = 0.0
 
         normalize_obs = True
         obs = [
diff --git a/gym/exploration_analysis/calculate_smoothness.py b/gym/exploration_analysis/calculate_smoothness.py
new file mode 100644
index 00000000..8475edb0
--- /dev/null
+++ b/gym/exploration_analysis/calculate_smoothness.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+# Change signal to 500 steps
+smooth_name = "mini_cheetah_ref_smooth_16"
+baseline_name = "mini_cheetah_ref"
+colored_name = "mini_cheetah_ref_colored_1"
+
+colored_data_dir = "./data_train/" + colored_name
+smooth_data_dir = "./data_train/" + smooth_name
+baseline_data_dir = "./data_train/" + baseline_name
+
+# load data
+smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0]
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0]
+smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0]
+baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0]
+colored_pos_target = np.load(colored_data_dir + "/dof_pos_target.npy")[0]
+colored_terminated = np.load(colored_data_dir + "/terminated.npy")[0]
+
+# compute squared deltas between consecutive action targets
+smooth_squared_deltas = [[], [], []]
+colored_squared_deltas = [[], [], []]
+baseline_squared_deltas = [[], [], []]
+for it in range(0, baseline_pos_target.shape[0], 50):
+    # only use data that didn't terminate
+    if not np.any(smooth_terminated[it, :, 0]):
+        for idx in range(3):
+            squared_deltas = (
+                smooth_pos_target[it, 1:, idx] - smooth_pos_target[it, :-1, idx]
+            ) ** 2
+            smooth_squared_deltas[idx].append(squared_deltas)
+
+    if not np.any(baseline_terminated[it, :, 0]):
+        for idx in range(3):
+            squared_deltas = (
+                baseline_pos_target[it, 1:, idx] - baseline_pos_target[it, :-1, idx]
+            ) ** 2
+            baseline_squared_deltas[idx].append(squared_deltas)
+
+    if not np.any(colored_terminated[it, :, 0]):
+        for idx in range(3):
+            squared_deltas = (
+                colored_pos_target[it, 1:, idx] - colored_pos_target[it, :-1, idx]
+            ) ** 2
+            colored_squared_deltas[idx].append(squared_deltas)
+
+smooth_squared_deltas_array = np.array(smooth_squared_deltas)
+baseline_squared_deltas_array = np.array(baseline_squared_deltas)
+colored_squared_deltas_array = np.array(colored_squared_deltas)
+
+# Find the maximum value of each array
+max_smooth = np.max(smooth_squared_deltas_array)
+max_baseline = np.max(baseline_squared_deltas_array)
+max_colored = np.max(colored_squared_deltas_array)
+
+# Find the maximum value among the three arrays
+max_squared_value = max(max_smooth, max_baseline, max_colored)
+
+smooth_squared_deltas_scaled = np.divide(
+    smooth_squared_deltas_array[:, 0, :], max_squared_value
+)
+baseline_squared_deltas_scaled = np.divide(
+    baseline_squared_deltas_array[:, 0, :], max_squared_value
+)
+colored_squared_deltas_scaled = np.divide(
+    colored_squared_deltas_array[:, 0, :], max_squared_value
+)
+
+# Calculate the mean of each scaled array
+mean_smooth = np.mean(smooth_squared_deltas_scaled)
+mean_baseline = np.mean(baseline_squared_deltas_scaled)
+mean_colored = np.mean(colored_squared_deltas_scaled)
+
+# Print the mean of each scaled array (first logged iteration)
+print(f"Mean scaled smooth_squared_deltas at start of training: {mean_smooth*100}")
+print(f"Mean scaled baseline_squared_deltas at start of training: {mean_baseline*100}")
+print(f"Mean scaled colored_squared_deltas at start of training: {mean_colored*100}")
+
+smooth_squared_deltas_scaled = np.divide(
+    smooth_squared_deltas_array[:, -1, :], max_squared_value
+)
+baseline_squared_deltas_scaled = np.divide(
+    baseline_squared_deltas_array[:, -1, :], max_squared_value
+)
+colored_squared_deltas_scaled = np.divide(
+    colored_squared_deltas_array[:, -1, :], max_squared_value
+)
+
+# Calculate the mean of each scaled array
+mean_smooth = np.mean(smooth_squared_deltas_scaled)
+mean_baseline = np.mean(baseline_squared_deltas_scaled)
+mean_colored = np.mean(colored_squared_deltas_scaled)
+
+# Print the mean of each scaled array (last logged iteration)
+print(f"Mean scaled smooth_squared_deltas at end of training: {mean_smooth*100}")
+print(f"Mean scaled baseline_squared_deltas at end of training: {mean_baseline*100}")
+print(f"Mean scaled colored_squared_deltas at end of training: {mean_colored*100}")
diff --git a/gym/exploration_analysis/plot_ft.py b/gym/exploration_analysis/plot_ft.py
new file mode 100644
index 00000000..63835ccd
--- /dev/null
+++ b/gym/exploration_analysis/plot_ft.py
@@ -0,0 +1,106 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+
+# Change signal to 500 steps
+smooth_name = "mini_cheetah_ref_smooth_16"
+baseline_name = "mini_cheetah_ref"
+colored_name = "mini_cheetah_ref_colored_1"
+
+colored_data_dir = "./data_train/" + colored_name
+smooth_data_dir = "./data_train/" + smooth_name
+baseline_data_dir = "./data_train/" + baseline_name
+fig_dir = "./figures_train/"
+
+if not os.path.exists(fig_dir):
+    os.makedirs(fig_dir)
+
+# load data
+smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0]
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0]
+smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0]
+baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0]
+colored_pos_target = 
np.load(colored_data_dir + "/dof_pos_target.npy")[0] +colored_terminated = np.load(colored_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], [], [], [], [], [], [], [], [], [], []] +colored_ffts = [[], [], [], [], [], [], [], [], [], [], [], []] +baseline_ffts = [[], [], [], [], [], [], [], [], [], [], [], []] +for it in range(0, baseline_pos_target.shape[0], 50): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(12): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(12): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(colored_terminated[it, :, 0]): + for idx in range(12): + fft = np.fft.fft(colored_pos_target[it, :, idx]) + colored_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") +print(f"Total colored FFTS: {len(colored_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(12)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(12)] +colored_fft_means = [np.array(colored_ffts[idx]).mean(axis=0) for idx in range(12)] + + +def moving_average(a, n=3): + ret = np.cumsum(a, dtype=float) + ret[n:] = ret[n:] - ret[:-n] + return ret[n - 1 :] / n + + +x_values = np.linspace(0, 50, 498) +# plot FFTs +fig, axs = plt.subplots(1, 2, figsize=(10, 4)) +for idx in range(2): + colored_smooth_start = moving_average( + np.array(np.abs(colored_ffts))[:, 0, :].mean(axis=0) + ) + baseline_smooth_start = moving_average( + np.array(np.abs(baseline_ffts))[:, 0, :].mean(axis=0) + ) + sde_smooth_start = moving_average( + np.array(np.abs(smooth_ffts))[:, 0, :].mean(axis=0) + ) + colored_smooth_end = moving_average( + np.array(np.abs(colored_ffts))[:, -1, :].mean(axis=0) + ) + baseline_smooth_end = moving_average( + np.array(np.abs(baseline_ffts))[:, -1, :].mean(axis=0) + ) + sde_smooth_end = moving_average( + np.array(np.abs(smooth_ffts))[:, -1, :].mean(axis=0) + ) + + if idx == 0: + axs[idx].plot(x_values, colored_smooth_start, label="Pink", color="blue") + axs[idx].plot(x_values, baseline_smooth_start, label="Baseline", color="green") + axs[idx].plot(x_values, sde_smooth_start, label="gSDE-16", color="red") + axs[idx].set_title("Fourier Transform at the Beginning of Training") + axs[idx].set_xlabel("Frequency [Hz]") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend() + axs[idx].set_ylim([-1, 40]) + + else: + axs[idx].plot(x_values, colored_smooth_end, label="Pink", color="blue") + axs[idx].plot(x_values, baseline_smooth_end, label="Baseline", color="green") + axs[idx].plot(x_values, sde_smooth_end, label="gSDE-16", color="red") + axs[idx].set_title("Fourier Transform at the End of Training") + axs[idx].set_xlabel("Frequency [Hz]") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend() + axs[idx].set_ylim([-1, 40]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + "fourier.png") diff --git a/gym/smooth_exploration/plot_play.py b/gym/exploration_analysis/plot_play.py similarity index 100% rename from gym/smooth_exploration/plot_play.py rename to gym/exploration_analysis/plot_play.py diff --git a/gym/exploration_analysis/plot_power.py b/gym/exploration_analysis/plot_power.py new file mode 100644 index 00000000..e38e7ea8 --- /dev/null +++ 
b/gym/exploration_analysis/plot_power.py
@@ -0,0 +1,77 @@
+import numpy as np
+import os
+
+smooth_name = "mini_cheetah_ref_smooth_16"
+baseline_name = "mini_cheetah_ref"
+colored_name = "mini_cheetah_ref_colored_1"
+
+colored_data_dir = "./data_train/" + colored_name
+smooth_data_dir = "./data_train/" + smooth_name
+baseline_data_dir = "./data_train/" + baseline_name
+fig_dir = "./figures_train/"
+
+if not os.path.exists(fig_dir):
+    os.makedirs(fig_dir)
+
+# load data
+smooth_dof_vel = np.load(smooth_data_dir + "/dof_vel.npy")[0]
+baseline_dof_vel = np.load(baseline_data_dir + "/dof_vel.npy")[0]
+smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0]
+baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0]
+colored_dof_vel = np.load(colored_data_dir + "/dof_vel.npy")[0]
+colored_terminated = np.load(colored_data_dir + "/terminated.npy")[0]
+smooth_torques = np.load(smooth_data_dir + "/torques.npy")[0]
+baseline_torques = np.load(baseline_data_dir + "/torques.npy")[0]
+colored_torques = np.load(colored_data_dir + "/torques.npy")[0]
+
+smooth_power = [[], [], [], [], [], [], [], [], [], [], [], []]
+colored_power = [[], [], [], [], [], [], [], [], [], [], [], []]
+baseline_power = [[], [], [], [], [], [], [], [], [], [], [], []]
+for it in range(0, smooth_dof_vel.shape[0], 50):
+    # only use data that didn't terminate
+    if not np.any(smooth_terminated[it, :, 0]):
+        for idx in range(12):
+            smooth_power[idx].append(
+                np.abs(
+                    np.multiply(smooth_dof_vel[it, :, idx], smooth_torques[it, :, idx])
+                )
+            )
+
+    if not np.any(baseline_terminated[it, :, 0]):
+        for idx in range(12):
+            baseline_power[idx].append(
+                np.abs(
+                    np.multiply(
+                        baseline_dof_vel[it, :, idx], baseline_torques[it, :, idx]
+                    )
+                )
+            )
+
+    if not np.any(colored_terminated[it, :, 0]):
+        for idx in range(12):
+            colored_power[idx].append(
+                np.abs(
+                    np.multiply(
+                        colored_dof_vel[it, :, idx], colored_torques[it, :, idx]
+                    )
+                )
+            )
+
+print(f"Total smooth: {len(smooth_power[0])}")
+print(f"Total baseline: {len(baseline_power[0])}")
+print(f"Total colored: {len(colored_power[0])}")
+
+power_values = [
+    np.array(smooth_power),
+    np.array(baseline_power),
+    np.array(colored_power),
+]
+
+# Calculate mean power at the beginning and end of training
+# Entries are ordered [smooth, baseline, colored]
+
+mean_power_beginning = [power[:, 0, :].mean() for power in power_values]
+mean_power_end = [power[:, -1, :].mean() for power in power_values]
+
+print(f"Mean power at the beginning of training: {mean_power_beginning}")
+print(f"Mean power at the end of training: {mean_power_end}")
diff --git a/gym/smooth_exploration/plot_train.py b/gym/exploration_analysis/plot_train.py
similarity index 91%
rename from gym/smooth_exploration/plot_train.py
rename to gym/exploration_analysis/plot_train.py
index 58c50396..d923f8e5 100644
--- a/gym/smooth_exploration/plot_train.py
+++ b/gym/exploration_analysis/plot_train.py
@@ -7,7 +7,7 @@
 SAMPLE_FREQ = 16
 STEPS = 1000
 
-name = "ref_sample_16_len_1000"
+name = "mini_cheetah_ref_colored_0.5"
 
 data_dir = "./data_train/" + name
 fig_dir = "./figures_train/" + name
@@ -29,16 +29,11 @@ def plot_fourier(data, it):
         ft = np.fft.fft(data[:, i])
         ft_half = ft[: len(ft) // 2]
         axs_ft[0].plot(np.abs(ft_half))
-        axs_ft[1].plot(np.angle(ft_half))
 
     axs_ft[0].set_title("FT Amplitude")
     axs_ft[0].set_xlabel("Frequency")
     axs_ft[0].set_ylabel("Amplitude")
     axs_ft[0].legend(["idx 0", "idx 1", "idx 2"])
 
-    axs_ft[1].set_title("FT Phase")
-    axs_ft[1].set_xlabel("Frequency")
-    axs_ft[1].set_ylabel("Phase")
-    axs_ft[1].legend(["idx 0", "idx 1", "idx 2"])
 
     fig_ft.savefig(fig_dir + 
"/dof_pos_target_FT_it_" + str(it) + ".png") diff --git a/gym/exploration_analysis/plot_variance.py b/gym/exploration_analysis/plot_variance.py new file mode 100644 index 00000000..3628ef75 --- /dev/null +++ b/gym/exploration_analysis/plot_variance.py @@ -0,0 +1,167 @@ +import numpy as np +import matplotlib.pyplot as plt +import os +import seaborn as sns + +WINDOW_SIZE = 20 + +smooth_name = "mini_cheetah_ref_smooth_16" +baseline_name = "mini_cheetah_ref" +colored_name = "mini_cheetah_ref_colored_1" + +colored_data_dir = "./data_train/" + colored_name +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + + +def rolling_variance(signal, window_size): + window = np.ones(window_size) / window_size + signal_mean = np.convolve(signal, window, "valid") + signal_sqr = np.convolve(signal**2, window, "valid") + return signal_sqr - signal_mean**2 + + +# load data +smooth_pos_obs = np.load(smooth_data_dir + "/dof_pos_obs.npy")[0] +baseline_pos_obs = np.load(baseline_data_dir + "/dof_pos_obs.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] +colored_pos_obs = np.load(colored_data_dir + "/dof_pos_obs.npy")[0] +colored_terminated = np.load(colored_data_dir + "/terminated.npy")[0] +smooth_dof_vel = np.load(smooth_data_dir + "/dof_vel.npy")[0] +baseline_dof_vel = np.load(baseline_data_dir + "/dof_vel.npy")[0] +colored_dof_vel = np.load(colored_data_dir + "/dof_vel.npy")[0] + +# compute variance averages +smooth_vars = [[], [], [], [], [], [], [], [], [], [], [], []] +colored_vars = [[], [], [], [], [], [], [], [], [], [], [], []] +baseline_vars = [[], [], [], [], [], [], [], [], [], [], [], []] +total_smooth_var = [[], [], [], [], [], [], [], [], [], []] +total_baseline_var = [[], [], [], [], [], [], [], [], [], []] +total_colored_var = [[], [], [], [], [], [], [], [], [], []] + +i = 0 +for it in range(0, smooth_pos_obs.shape[0], 50): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(12): + # var = rolling_variance(smooth_pos_obs[it, :, idx], WINDOW_SIZE) + total_var_obs = np.var(smooth_pos_obs[it, :, idx]) + total_var_vel = np.var(smooth_dof_vel[it, :, idx]) + total_smooth_var[i].append(total_var_obs) + total_smooth_var[i].append(total_var_vel) + # smooth_vars[idx].append(var) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(12): + # var = rolling_variance(baseline_pos_obs[it, :, idx], WINDOW_SIZE) + total_var_obs = np.var(baseline_pos_obs[it, :, idx]) + total_var_vel = np.var(baseline_dof_vel[it, :, idx]) + total_baseline_var[i].append(total_var_obs) + total_baseline_var[i].append(total_var_vel) + # baseline_vars[idx].append(var) + + if not np.any(colored_terminated[it, :, 0]): + for idx in range(12): + # var = rolling_variance(colored_pos_obs[it, :, idx], WINDOW_SIZE) + total_var_obs = np.var(colored_pos_obs[it, :, idx]) + total_var_vel = np.var(colored_dof_vel[it, :, idx]) + total_colored_var[i].append(total_var_obs) + total_colored_var[i].append(total_var_vel) + # colored_vars[idx].append(var) + i += 1 + +# print(f"Total smooth variance: {len(smooth_vars[0])}") +# print(f"Total baseline variance: {len(baseline_vars[0])}") +# print(f"Total colored variance: {len(colored_vars[0])}") + +# smooth_var_means = [np.array(smooth_vars[idx]).mean(axis=0) for idx in range(3)] +# baseline_var_means = 
[np.array(baseline_vars[idx]).mean(axis=0) for idx in range(3)] +# colored_var_means = [np.array(colored_vars[idx]).mean(axis=0) for idx in range(3)] + +# plot Variance +# fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +# for idx in range(3): +# axs[idx].plot(smooth_var_means[idx], label="smooth") +# axs[idx].plot(baseline_var_means[idx], label="baseline") +# axs[idx].plot(colored_var_means[idx], label="colored") +# axs[idx].set_title(f"Variance Amplitude idx {idx}") +# axs[idx].set_xlabel("Step") +# axs[idx].set_ylabel("Amplitude") +# axs[idx].legend() + +# fig.tight_layout() +# fig.savefig(fig_dir + "/" + "variance.png") + +# Generate x-axis labels for each array +array_labels = ["Axis {}".format(i) for i in range(1, 25)] + +# Plotting +plt.figure(figsize=(10, 6)) + +# Plot variances for baseline +plt.plot( + array_labels, + total_baseline_var[9], + marker="o", + linestyle="-", + color="green", + label="Baseline", +) + +# Plot variances for method 1 +plt.plot( + array_labels, + total_smooth_var[9], + marker="o", + linestyle="-", + color="red", + label="Smooth", +) + +# Plot variances for method 2 +plt.plot( + array_labels, + total_colored_var[9], + marker="o", + linestyle="-", + color="blue", + label="Colored", +) + +# Add labels and legend +plt.xlabel("Axis") +plt.ylabel("Variance") +plt.title("Variances of Axis Across Different Implementations") +plt.xticks(rotation=45) +plt.legend() +plt.grid(True) +plt.tight_layout() +plt.savefig(fig_dir + "/" + "variance_2.png") + +# Combine variances into a single dataset +variances_combined = [total_baseline_var[9], total_smooth_var[9], total_colored_var[9]] + +# Create a boxplot +plt.figure(figsize=(10, 6)) +sns.boxplot(data=variances_combined) +plt.xlabel("Method") +plt.ylabel("Variance") +plt.title("Distribution of Variances Across Methods (Boxplot)") +plt.xticks(ticks=[0, 1, 2], labels=["Baseline", "Smooth", "Colored"]) +plt.grid(True) +plt.show() + +# Create a violin plot +plt.figure(figsize=(10, 6)) +sns.violinplot(data=variances_combined) +plt.xlabel("Method") +plt.ylabel("Variance") +plt.title("Distribution of Variances Across Methods (Violin Plot)") +plt.xticks(ticks=[0, 1, 2], labels=["Baseline", "Smooth", "Colored"]) +plt.grid(True) +plt.show() diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py deleted file mode 100644 index 5ad86b63..00000000 --- a/gym/smooth_exploration/plot_ft.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -import os - -SAMPLE_FREQ = 8 -STEPS = 500 - -smooth_name = "ref_sample_8_len_500" -baseline_name = "ref_baseline_len_500" - -smooth_data_dir = "./data_train/" + smooth_name -baseline_data_dir = "./data_train/" + baseline_name -fig_dir = "./figures_train/" - -if not os.path.exists(fig_dir): - os.makedirs(fig_dir) - -# load data -smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] -baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] -smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] -baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] - -# compute FFT averages -smooth_ffts = [[], [], []] -baseline_ffts = [[], [], []] -for it in range(0, smooth_pos_target.shape[0], 10): - # only use data that didn't terminate - if not np.any(smooth_terminated[it, :, 0]): - for idx in range(3): - fft = np.fft.fft(smooth_pos_target[it, :, idx]) - smooth_ffts[idx].append(fft[: len(fft) // 2]) - - if not np.any(baseline_terminated[it, :, 0]): - for idx in range(3): - fft = 
np.fft.fft(baseline_pos_target[it, :, idx]) - baseline_ffts[idx].append(fft[: len(fft) // 2]) - -print(f"Total smooth FFTS: {len(smooth_ffts[0])}") -print(f"Total baseline FFTS: {len(baseline_ffts[0])}") - -smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] -baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] - -# plot FFTs -fig, axs = plt.subplots(3, 1, figsize=(10, 10)) -for idx in range(3): - axs[idx].plot(np.abs(smooth_fft_means[idx])) - axs[idx].plot(np.abs(baseline_fft_means[idx])) - axs[idx].set_title(f"FT Amplitude idx {idx}") - axs[idx].set_xlabel("Frequency") - axs[idx].set_ylabel("Amplitude") - axs[idx].legend(["smooth", "baseline"]) - -fig.tight_layout() -fig.savefig(fig_dir + "/" + smooth_name + ".png") diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 79ac2748..03fd03cc 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -35,4 +35,4 @@ from .actor import Actor from .critic import Critic from .smooth_actor import SmoothActor -from .pink_actor import PinkActor +from .colored_actor import ColoredActor diff --git a/learning/modules/pink_actor.py b/learning/modules/colored_actor.py similarity index 61% rename from learning/modules/pink_actor.py rename to learning/modules/colored_actor.py index 6111694b..9e0ef462 100644 --- a/learning/modules/pink_actor.py +++ b/learning/modules/colored_actor.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn from torch.distributions import Normal -import numpy as np from pink import ColoredNoiseProcess from .actor import Actor @@ -11,30 +10,33 @@ # The following implementation is based on the pinkNoise paper. See code: # https://github.com/martius-lab/pink-noise-rl/blob/main/pink/sb3.py -class PinkActor(Actor): +class ColoredActor(Actor): def __init__( self, *args, + num_envs, epsilon: float = 1e-6, - log_std_init: float = 0.0, - beta=0.5, **kwargs, ): super().__init__(*args, **kwargs) + if "sample_freq" in kwargs["exploration"]: + print("sample_freq is not used in ColoredActor") self.epsilon = epsilon - self.log_std_init = log_std_init - self.beta = beta + self.log_std_init = kwargs["exploration"]["log_std_init"] + self.beta = kwargs["exploration"]["beta"] - # TODO[ni]: get control frequency and episode time from config - self.gen = ColoredNoiseProcess(beta=self.beta, size=(self.num_actions, 500)) + self.num_envs = num_envs + self.gen = [None] * self.num_envs + horizon = 500 # This is the control frequency times the episode length + for i in range(self.num_envs): + self.gen[i] = ColoredNoiseProcess( + beta=self.beta, size=(self.num_actions, horizon) + ) self.log_std = nn.Parameter( - torch.ones(self.num_actions) * log_std_init, requires_grad=True + torch.ones(self.num_actions) * self.log_std_init, requires_grad=True ) - # Debug mode for plotting - self.debug = True - def update_distribution(self, observations): if self._normalize_obs: with torch.no_grad(): @@ -46,15 +48,14 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if np.isscalar(self.beta): - cn_sample = torch.tensor(self.gen.sample()).float() - else: - cn_sample = torch.tensor([cnp.sample() for cnp in self.gen]).float() - + cn_sample = self.num_envs * [torch.zeros(self.num_actions, 500)] + for i in range(self.num_envs): + cn_sample[i] = torch.tensor(self.gen[i].sample()).float() + # Send cn_sample to the device + cn_sample = torch.stack(cn_sample).to(self.log_std.device) mean = self.distribution.mean 
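+        # cn_sample has shape (num_envs, num_actions): correlated through time
+        # within each env's generator, independent across envs and action dims.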
-        cn_sample = cn_sample.to(self.log_std.device)
-
         sample = mean + torch.exp(self.log_std) * cn_sample
+
         if self.debug:
             path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_pink.csv"
             self.log_actions(mean[0][0], sample[0][0], path)
diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py
index dae53c9e..4ff7d154 100644
--- a/learning/modules/smooth_actor.py
+++ b/learning/modules/smooth_actor.py
@@ -26,6 +26,8 @@ def __init__(
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
+        if "beta" in kwargs["exploration"]:
+            print("beta is not used in SmoothActor")
         self.full_std = full_std
         self.use_exp_ln = use_exp_ln
         self.learn_features = learn_features
diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py
index f2a9508c..5f63c76f 100644
--- a/learning/runners/BaseRunner.py
+++ b/learning/runners/BaseRunner.py
@@ -1,6 +1,6 @@
 import torch
 from learning.algorithms import *  # noqa: F403
-from learning.modules import Actor, Critic, SmoothActor, PinkActor
+from learning.modules import Actor, Critic, SmoothActor, ColoredActor
 from learning.utils import remove_zero_weighted_rewards
 
 
@@ -22,10 +22,12 @@ def _set_up_alg(self):
         num_actor_obs = self.get_obs_size(self.actor_cfg["obs"])
         num_actions = self.get_action_size(self.actor_cfg["actions"])
         num_critic_obs = self.get_obs_size(self.critic_cfg["obs"])
-        if self.actor_cfg["smooth_exploration"]:
+        if self.actor_cfg["exploration"]["type"] == "smooth":
             actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg)
-        elif self.actor_cfg["pink_exploration"]:
-            actor = PinkActor(num_actor_obs, num_actions, **self.actor_cfg)
+        elif self.actor_cfg["exploration"]["type"] == "colored":
+            actor = ColoredActor(
+                num_actor_obs, num_actions, num_envs=self.env.num_envs, **self.actor_cfg
+            )
         else:
             actor = Actor(num_actor_obs, num_actions, **self.actor_cfg)
         critic = Critic(num_critic_obs, **self.critic_cfg)
diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py
index 6c863adb..f17e3691 100644
--- a/learning/runners/old_policy_runner.py
+++ b/learning/runners/old_policy_runner.py
@@ -4,7 +4,7 @@
 from learning.utils import Logger
 from .BaseRunner import BaseRunner
 from learning.algorithms import PPO  # noqa: F401
-from learning.modules import ActorCritic, Actor, Critic, SmoothActor, PinkActor
+from learning.modules import ActorCritic, Actor, Critic, SmoothActor, ColoredActor
 
 logger = Logger()
 
@@ -24,10 +24,12 @@ def _set_up_alg(self):
         num_actor_obs = self.get_obs_size(self.actor_cfg["obs"])
         num_actions = self.get_action_size(self.actor_cfg["actions"])
         num_critic_obs = self.get_obs_size(self.critic_cfg["obs"])
-        if self.actor_cfg["smooth_exploration"]:
+        if self.actor_cfg["exploration"]["type"] == "smooth":
             actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg)
-        elif self.actor_cfg["pink_exploration"]:
-            actor = PinkActor(num_actor_obs, num_actions, **self.actor_cfg)
+        elif self.actor_cfg["exploration"]["type"] == "colored":
+            actor = ColoredActor(
+                num_actor_obs, num_actions, num_envs=self.env.num_envs, **self.actor_cfg
+            )
         else:
             actor = Actor(num_actor_obs, num_actions, **self.actor_cfg)
         critic = Critic(num_critic_obs, **self.critic_cfg)
@@ -48,7 +48,7 @@ def learn(self):
         self.save()
 
         # * Initialize smooth exploration matrices
-        if self.actor_cfg["smooth_exploration"]:
+        if self.actor_cfg["exploration"]["type"] == "smooth":
             self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs)
 
         logger.tic("runtime")
@@ -59,8 +59,13 @@ def learn(self):
         with torch.inference_mode():
             for i in range(self.num_steps_per_env):
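+                # Colored-noise exploration needs no explicit re-sampling here:
+                # each act() call advances the per-env noise buffers one step.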
# * Re-sample noise matrix for smooth exploration - sample_freq = self.actor_cfg["exploration_sample_freq"] - if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + sample_freq = self.actor_cfg["exploration"]["sample_freq"] + if ( + self.actor_cfg["exploration"]["type"] == "smooth" + and i % sample_freq == 0 + ): self.alg.actor_critic.actor.sample_weights( batch_size=self.env.num_envs ) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fc754e17..34600ae8 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -33,7 +33,7 @@ def learn(self, states_to_log_dict=None): self.save() # * Initialize smooth exploration matrices - if self.actor_cfg["smooth_exploration"]: + if self.actor_cfg["exploration"]["type"] == "smooth": self.alg.actor.sample_weights(batch_size=self.env.num_envs) # * start up storage @@ -76,15 +76,18 @@ def learn(self, states_to_log_dict=None): # * Simulate environment and log states if states_to_log_dict is not None: it_idx = self.it - 1 - if it_idx % 10 == 0: + if it_idx % 50 == 0: self.sim_and_log_states(states_to_log_dict, it_idx) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration - sample_freq = self.actor_cfg["exploration_sample_freq"] - if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + sample_freq = self.actor_cfg["exploration"]["sample_freq"] + if ( + self.actor_cfg["exploration"]["type"] == "smooth" + and i % sample_freq == 0 + ): self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) @@ -167,7 +170,9 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) @@ -213,30 +218,31 @@ def sim_and_log_states(self, states_to_log_dict, it_idx): # Simulate environment for as many steps as expected in the dict. # Log states to the dict, as well as whether the env terminated. 
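+        # The rollout uses the stochastic act() path rather than act_inference,
+        # so the logged trajectories include the exploration noise under study.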
steps = states_to_log_dict["terminated"].shape[2] - actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) with torch.inference_mode(): for i in range(steps): - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) + sample_freq = self.actor_cfg["exploration"]["sample_freq"] + if ( + self.actor_cfg["exploration"]["type"] == "smooth" + and i % sample_freq == 0 + ): + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) self.set_actions( - self.policy_cfg["actions"], + self.actor_cfg["actions"], actions, - self.policy_cfg["disable_actions"], + self.actor_cfg["disable_actions"], ) self.env.step() actor_obs = self.get_noisy_obs( - self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) # Log states (just for the first env) terminated = self.get_terminated()[0] diff --git a/scripts/log_play.py b/scripts/log_play.py index f7fd68e8..76a31f67 100644 --- a/scripts/log_play.py +++ b/scripts/log_play.py @@ -68,7 +68,7 @@ def play(env, runner, train_cfg): log_file_path = os.path.join( LEGGED_GYM_ROOT_DIR, "gym", - "smooth_exploration", + "exploration_analysis", "data_play", protocol_name + ".npz", ) diff --git a/scripts/log_train.py b/scripts/log_train.py index 37fca0da..8136fa10 100644 --- a/scripts/log_train.py +++ b/scripts/log_train.py @@ -1,6 +1,7 @@ from gym.envs import __init__ # noqa: F401 from gym.utils import get_args, task_registry, randomize_episode_counters -from gym.utils.logging_and_saving import wandb_singleton + +# from gym.utils.logging_and_saving import wandb_singleton from gym.utils.logging_and_saving import local_code_save_helper from gym import LEGGED_GYM_ROOT_DIR @@ -9,7 +10,7 @@ import os import numpy as np -TRAIN_ITERATIONS = 100 +TRAIN_ITERATIONS = 500 ROLLOUT_TIMESTEPS = 1000 @@ -49,11 +50,11 @@ def create_logging_dict(runner): def setup(): args = get_args() - wandb_helper = wandb_singleton.WandbSingleton() + # wandb_helper = wandb_singleton.WandbSingleton() env_cfg, train_cfg = task_registry.create_cfgs(args) task_registry.make_gym_and_sim() - wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + # wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) env = task_registry.make_env(name=args.task, env_cfg=env_cfg) randomize_episode_counters(env) @@ -72,7 +73,7 @@ def train(train_cfg, policy_runner): log_file_path = os.path.join( LEGGED_GYM_ROOT_DIR, "gym", - "smooth_exploration", + "exploration_analysis", "data_train", protocol_name + ".npz", ) diff --git a/scripts/sweep_configs/sweep_colored_exploration.json b/scripts/sweep_configs/sweep_colored_exploration.json new file mode 100644 index 00000000..78b18d77 --- /dev/null +++ b/scripts/sweep_configs/sweep_colored_exploration.json @@ -0,0 +1,16 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.actor.exploration.beta":{ + "values": [0.5, 1, 1.5] + }, + "train_cfg.actor.exploration.log_std_init":{ + "values": [-0.5, 0, 0.5] + } + } +} \ No newline at end of file