From 0ccbf4612060fb92b52a46145678230650137d24 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Mon, 15 Jul 2024 15:16:12 -0400
Subject: [PATCH 1/5] SE re-implemented

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        |  2 +
 .../mini_cheetah/mini_cheetah_ref_config.py      | 34 ++++++-
 learning/algorithms/SE.py                        | 48 ++++++++++
 learning/modules/state_estimator.py              | 14 ++-
 learning/runners/BaseRunner.py                   |  9 ++
 learning/runners/on_policy_runner.py             | 89 ++++++++++++++++---
 6 files changed, 181 insertions(+), 15 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 57e1f0e7..9cf75fb2 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -14,6 +14,8 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
         self.leg_ref = 3 * to_torch(pd.read_csv(csv_path).to_numpy(), device=sim_device)
         self.omega = 2 * torch.pi * cfg.control.gait_freq
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _init_buffers(self):
         super()._init_buffers()
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 74e60d59..17c60c3a 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -62,7 +62,17 @@ class reward_settings(MiniCheetahCfg.reward_settings):
         tracking_sigma = 0.25
 
     class scaling(MiniCheetahCfg.scaling):
-        pass
+        base_ang_vel = 0.3
+        base_lin_vel = BASE_HEIGHT_REF
+        se_base_lin_vel = base_lin_vel
+        dof_vel = 4 * [2.0, 2.0, 4.0]
+        base_height = 0.3
+        se_base_height = base_height
+        dof_pos = 4 * [0.2, 0.3, 0.3]
+        dof_pos_obs = dof_pos
+        dof_pos_target = 4 * [0.2, 0.3, 0.3]
+        tau_ff = 4 * [18, 18, 28]
+        commands = [3, 1, 3]
 
 
 class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
@@ -76,6 +86,8 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
+            "se_base_height",
+            "se_base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -139,6 +151,26 @@ class weights:
         class termination_weight:
             termination = 0.15
 
+    class state_estimator:
+        class network:
+            hidden_dims = [128, 128]
+            activation = "elu"
+            dropouts = None
+
+        obs = [
+            "base_ang_vel",
+            "projected_gravity",
+            "dof_pos_obs",
+            "dof_vel",
+            "phase_obs",
+        ]
+        targets = ["base_height", "base_lin_vel"]
+        write_to = ["se_base_height", "se_base_lin_vel"]
+        batch_size = 2**15
+        max_gradient_steps = 10
+        normalize_obs = True
+        learning_rate = 1e-4
+
     class algorithm(MiniCheetahRunnerCfg.algorithm):
         pass
diff --git a/learning/algorithms/SE.py b/learning/algorithms/SE.py
index 99dfb75f..059a5df5 100644
--- a/learning/algorithms/SE.py
+++ b/learning/algorithms/SE.py
@@ -3,9 +3,56 @@
 from learning.modules import StateEstimatorNN
 from learning.storage import SERolloutStorage
+from learning.utils import create_uniform_generator
 
 
 class StateEstimator:
+    def __init__(
+        self,
+        state_estimator,
+        normalize_obs=True,
+        batch_size=2**15,
+        max_gradient_steps=10,
+        learning_rate=1e-3,
+        device="cpu",
+        **kwargs,
+    ):
+        self.device = device
+
+        self.network = state_estimator.to(self.device)
+
+        self.batch_size = batch_size
+        self.max_gradient_steps = max_gradient_steps
+
+        self.learning_rate = learning_rate
+        self.mean_loss = 0.0
+        self.optimizer = optim.Adam(self.network.parameters(), lr=learning_rate)
+
+    def update(self, data):
+        self.mean_loss = 0
+        counter = 0
+        generator = create_uniform_generator(
+            data, self.batch_size, self.max_gradient_steps
+        )
+        for batch in generator:
+            loss = nn.functional.mse_loss(
+                self.network.evaluate(batch["SE_obs"]), batch["SE_targets"]
+            )
+            self.optimizer.zero_grad()
+            loss.backward()
+            self.optimizer.step()
+            self.mean_loss += loss.item()
+            counter += 1
+        self.mean_loss /= counter
+
+    def estimate(self, obs):
+        return self.network.evaluate(obs)
+
+    def export(self, path):
+        self.network.export(path)
+
+
+class OldStateEstimator:
     """This class provides a learned state estimator.
     This is trained with supervised learning, using only on-policy data
     collected in a rollout storage.
@@ -40,6 +87,7 @@ def __init__(
         self.state_estimator.to(self.device)
         self.optimizer = optim.Adam(self.state_estimator.parameters(), lr=learning_rate)
         self.SE_loss_fn = nn.MSELoss()
+        self.SE_loss = 0.0
 
     def init_storage(self, num_envs, num_transitions_per_env, obs_shape, se_shape):
         self.storage = SERolloutStorage(
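The update() loop above depends on create_uniform_generator from learning.utils, which is imported but not shown in this series. Below is a minimal, self-contained sketch of the assumed contract (uniformly sampled mini-batches drawn from flattened rollout tensors, capped at a fixed number of gradient steps) together with the same supervised MSE update that StateEstimator.update() performs; the generator name, tensor shapes, and toy network are illustrative assumptions, not the repository's implementation:

    import torch
    import torch.nn as nn
    import torch.optim as optim

    def uniform_minibatches(data, batch_size, max_batches):
        # assumed contract: flatten (steps, envs, dim) rollout tensors and
        # yield uniformly sampled mini-batches, at most max_batches of them
        flat = {k: v.reshape(-1, v.shape[-1]) for k, v in data.items()}
        n = next(iter(flat.values())).shape[0]
        for _ in range(max_batches):
            idx = torch.randint(0, n, (batch_size,))
            yield {k: v[idx] for k, v in flat.items()}

    # toy run of the supervised regression on synthetic data
    net = nn.Sequential(nn.Linear(6, 128), nn.ELU(), nn.Linear(128, 4))
    opt = optim.Adam(net.parameters(), lr=1e-4)
    data = {"SE_obs": torch.randn(32, 64, 6), "SE_targets": torch.randn(32, 64, 4)}
    for batch in uniform_minibatches(data, batch_size=256, max_batches=10):
        loss = nn.functional.mse_loss(net(batch["SE_obs"]), batch["SE_targets"])
        opt.zero_grad()
        loss.backward()
        opt.step()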
diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py
index f8cfe221..26a606f7 100644
--- a/learning/modules/state_estimator.py
+++ b/learning/modules/state_estimator.py
@@ -1,6 +1,5 @@
 import torch.nn as nn
-from .utils import create_MLP
-from .utils import export_network
+from .utils import create_MLP, export_network, RunningMeanStd
 
 
 class StateEstimatorNN(nn.Module):
@@ -21,6 +20,7 @@ def __init__(
         hidden_dims=[256, 128],
         activation="elu",
         dropouts=None,
+        normalize_obs=True,
         **kwargs,
     ):
         if kwargs:
@@ -30,13 +30,19 @@ def __init__(
             )
         super().__init__()
 
+        self._normalize_obs = normalize_obs
+        if self._normalize_obs:
+            self.obs_rms = RunningMeanStd(num_inputs)
+
         self.num_inputs = num_inputs
         self.num_outputs = num_outputs
 
         self.NN = create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts)
         print(f"State Estimator MLP: {self.NN}")
 
-    def evaluate(self, observations):
-        return self.NN(observations)
+    def evaluate(self, obs):
+        if self._normalize_obs:
+            obs = self.obs_rms(obs)
+        return self.NN(obs)
 
     def export(self, path):
         export_network(self.NN, "state_estimator", path, self.num_inputs)
diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py
index 6819f4f4..ac23cdb1 100644
--- a/learning/runners/BaseRunner.py
+++ b/learning/runners/BaseRunner.py
@@ -34,10 +34,19 @@ def _set_up_alg(self):
 
     def parse_train_cfg(self, train_cfg):
         self.cfg = train_cfg["runner"]
+        del train_cfg["runner"]
+
         self.alg_cfg = train_cfg["algorithm"]
+        del train_cfg["algorithm"]
+
         remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"])
         self.actor_cfg = train_cfg["actor"]
+        del train_cfg["actor"]
+
         self.critic_cfg = train_cfg["critic"]
+        del train_cfg["critic"]
+
+        self.train_cfg = train_cfg
 
     def init_storage(self):
         raise NotImplementedError
diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py
index f601ce71..9381b09f 100644
--- a/learning/runners/on_policy_runner.py
+++ b/learning/runners/on_policy_runner.py
@@ -7,8 +7,12 @@
 from .BaseRunner import BaseRunner
 from learning.storage import DictStorage
+from learning.algorithms import StateEstimator
+from learning.modules import StateEstimatorNN
+
 
 logger = Logger()
 storage = DictStorage()
+SEStorage = DictStorage()
 
 
 class OnPolicyRunner(BaseRunner):
@@ -21,6 +25,21 @@ def __init__(self, env, train_cfg, device="cpu"):
             self.device,
         )
 
+    def _set_up_alg(self):
+        super()._set_up_alg()
+        if "state_estimator" in self.train_cfg.keys():
+            self.train_SE = True
+            cfg = self.train_cfg["state_estimator"]
+            # todo refactor cfg
+            state_estimator_network = StateEstimatorNN(
+                self.get_obs_size(cfg["obs"]),
+                self.get_obs_size(cfg["targets"]),
+                **cfg["network"],
+            )
+            self.SE = StateEstimator(state_estimator_network, device=self.device, **cfg)
+        else:
+            self.train_SE = False
+
     def learn(self, states_to_log_dict=None):
         self.set_up_logger()
 
@@ -45,6 +64,17 @@ def learn(self, states_to_log_dict=None):
                 "dones": self.get_timed_out(),
             }
         )
+
+        if self.train_SE:
+            transition.update(
+                {
+                    "SE_obs": self.get_obs(self.train_cfg["state_estimator"]["obs"]),
+                    "SE_targets": self.get_obs(
+                        self.train_cfg["state_estimator"]["targets"]
+                    ),
+                }
+            )
+
         storage.initialize(
             transition,
             self.env.num_envs,
@@ -87,6 +117,23 @@ def learn(self, states_to_log_dict=None):
 
                 self.env.step()
 
+                if self.train_SE:
+                    # use the state-estimator
+                    SE_obs = self.get_obs(self.train_cfg["state_estimator"]["obs"])
+                    self.env.set_states(
+                        self.train_cfg["state_estimator"]["write_to"],
+                        self.SE.estimate(SE_obs),
+                    )
+                    # update storage for training estimator
+                    transition.update(
+                        {
+                            "SE_obs": SE_obs,
+                            "SE_targets": self.get_obs(
+                                self.train_cfg["state_estimator"]["targets"]
+                            ),
+                        }
+                    )
+
                 actor_obs = self.get_noisy_obs(
                     self.actor_cfg["obs"], self.actor_cfg["noise"]
                 )
@@ -109,6 +156,7 @@ def learn(self, states_to_log_dict=None):
                         "dones": dones,
                     }
                 )
+
                 storage.add_transitions(transition)
 
                 logger.log_rewards(rewards_dict)
@@ -118,6 +166,8 @@ def learn(self, states_to_log_dict=None):
 
             logger.tic("learning")
             self.alg.update(storage.data)
+            if self.train_SE:
+                self.SE.update(storage.data)
             storage.clear()
             logger.toc("learning")
             logger.log_all_categories()
@@ -173,21 +222,31 @@ def set_up_logger(self):
         )
         logger.register_category("actor", self.alg.actor, ["action_std", "entropy"])
 
+        if self.train_SE:
+            logger.register_category("state_estimator", self.SE, ["mean_loss"])
+
         logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic))
 
     def save(self):
         os.makedirs(self.log_dir, exist_ok=True)
         path = os.path.join(self.log_dir, "model_{}.pt".format(self.it))
-        torch.save(
-            {
-                "actor_state_dict": self.alg.actor.state_dict(),
-                "critic_state_dict": self.alg.critic.state_dict(),
-                "optimizer_state_dict": self.alg.optimizer.state_dict(),
-                "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(),
-                "iter": self.it,
-            },
-            path,
-        )
+        save_dict = {
+            "actor_state_dict": self.alg.actor.state_dict(),
+            "critic_state_dict": self.alg.critic.state_dict(),
+            "optimizer_state_dict": self.alg.optimizer.state_dict(),
+            "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(),
+            "iter": self.it,
+        }
+
+        if self.train_SE:
+            save_dict.update(
+                {
+                    "SE_state_dict": self.SE.network.state_dict(),
+                    "SE_optimizer_state_dict": self.SE.optimizer.state_dict(),
+                }
+            )
+
+        torch.save(save_dict, path)
 
     def load(self, path, load_optimizer=True):
         loaded_dict = torch.load(path)
@@ -199,17 +258,28 @@ def load(self, path, load_optimizer=True):
                 loaded_dict["critic_optimizer_state_dict"]
             )
         self.it = loaded_dict["iter"]
+        if self.train_SE:
+            self.SE.network.load_state_dict(loaded_dict["SE_state_dict"])
+            self.SE.optimizer.load_state_dict(loaded_dict["SE_optimizer_state_dict"])
 
     def switch_to_eval(self):
         self.alg.actor.eval()
         self.alg.critic.eval()
 
     def get_inference_actions(self):
+        if self.train_SE:
+            SE_obs = self.get_obs(self.train_cfg["state_estimator"]["obs"])
+            self.env.set_states(
+                self.train_cfg["state_estimator"]["write_to"],
+                self.SE.estimate(SE_obs),
+            )
         obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"])
         return self.alg.actor.act_inference(obs)
 
     def export(self, path):
         self.alg.actor.export(path)
+        if self.train_SE:
+            self.SE.export(path)
 
     def sim_and_log_states(self, states_to_log_dict, it_idx):
         # Simulate environment for as many steps as expected in the dict.
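The runner changes in this patch all follow one ordering: build the estimator's observation, run the network, write the estimates into the environment's se_* states via env.set_states, and only then assemble the actor observation, so a policy that lists "se_base_height" / "se_base_lin_vel" in its obs reads estimated quantities rather than the privileged simulator state. A stripped-down sketch of that step, where every object is a stand-in for the corresponding runner/env attribute rather than the repository's API:

    def apply_state_estimate(env, se, se_cfg, get_obs):
        # 1) estimator input: the proprioceptive observations named in se_cfg["obs"]
        se_obs = get_obs(se_cfg["obs"])
        # 2) run the estimator and write its outputs into the env's se_* buffers
        env.set_states(se_cfg["write_to"], se.estimate(se_obs))
        # 3) the caller builds the actor observation afterwards, so it can
        #    include the freshly written "se_*" entries
        return se_obs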
From a09c33574bdc87e362e75004be3eb83f7e7d1e2b Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Tue, 16 Jul 2024 11:17:57 -0400
Subject: [PATCH 2/5] tweaks

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 10 +++++++---
 gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 12 ++++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 9cf75fb2..5bbbaee9 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -12,10 +12,8 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
             LEGGED_GYM_ROOT_DIR=LEGGED_GYM_ROOT_DIR
         )
         self.leg_ref = 3 * to_torch(pd.read_csv(csv_path).to_numpy(), device=sim_device)
-        self.omega = 2 * torch.pi * cfg.control.gait_freq
+        # self.omega = 2 * torch.pi * cfg.control.gait_freq
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
-        self.se_base_height = torch.zeros_like(self.base_height)
-        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _init_buffers(self):
         super()._init_buffers()
@@ -25,6 +23,9 @@ def _init_buffers(self):
         self.phase_obs = torch.zeros(
             self.num_envs, 2, dtype=torch.float, device=self.device
         )
+        self.omega = torch.ones(self.num_envs, 1, device=self.device)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
@@ -32,6 +33,9 @@ def _reset_system(self, env_ids):
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
 
+    def _reset_gait_frequencies(self, env_ids):
+        self.omega = torch_rand_float()
+
     def _post_physx_step(self):
         super()._post_physx_step()
         self.phase = (
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 17c60c3a..4680c78d 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -25,7 +25,7 @@ class control(MiniCheetahCfg.control):
         # * PD Drive parameters:
         stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0}
         damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5}
-        gait_freq = 3.0
+        gait_freq = [0.0, 3.0]
         ctrl_frequency = 100
         desired_sim_frequency = 500
 
@@ -95,7 +95,7 @@ class actor(MiniCheetahRunnerCfg.actor):
             "dof_vel",
             "phase_obs",
         ]
-        normalize_obs = True
+        normalize_obs = False
 
         actions = ["dof_pos_target"]
         disable_actions = False
@@ -125,7 +125,7 @@ class critic(MiniCheetahRunnerCfg.critic):
             "phase_obs",
             "dof_pos_target",
         ]
-        normalize_obs = True
+        normalize_obs = False
 
     class reward:
         class weights:
@@ -144,7 +144,7 @@ class weights:
             dof_pos_limits = 0.0
             feet_contact_forces = 0.0
             dof_near_home = 0.0
-            reference_traj = 1.5
+            reference_traj = 0.0
             swing_grf = 1.5
             stance_grf = 1.5
@@ -168,7 +168,7 @@ class network:
         write_to = ["se_base_height", "se_base_lin_vel"]
         batch_size = 2**15
         max_gradient_steps = 10
-        normalize_obs = True
+        normalize_obs = False
         learning_rate = 1e-4
 
     class algorithm(MiniCheetahRunnerCfg.algorithm):
@@ -177,6 +177,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
         experiment_name = "mini_cheetah_ref"
-        max_iterations = 1000  # number of policy updates
+        max_iterations = 500  # number of policy updates
         algorithm_class_name = "PPO2"
         num_steps_per_env = 32  # deprecate

From 937e001a06280aa02943441d09970ab0753165a1 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Tue, 16 Jul 2024 14:47:58 -0400
Subject: [PATCH 3/5] trot reward based on osc (smoothed with sin)

---
 gym/envs/mini_cheetah/mini_cheetah.py            |  2 ++
 .../mini_cheetah/mini_cheetah_osc_config.py      | 26 +++++++++++++++++--
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 26 ++++++++++++++++---
 .../mini_cheetah/mini_cheetah_ref_config.py      | 11 ++++----
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah.py b/gym/envs/mini_cheetah/mini_cheetah.py
index bea90140..99cd5bc5 100644
--- a/gym/envs/mini_cheetah/mini_cheetah.py
+++ b/gym/envs/mini_cheetah/mini_cheetah.py
@@ -6,6 +6,8 @@ class MiniCheetah(LeggedRobot):
 
     def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _reward_lin_vel_z(self):
         """Penalize z axis base linear velocity with squared exp"""
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
index 9d686f8e..f994ee88 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
@@ -74,7 +74,7 @@ class osc:
         coupling = 1  # 0.02
         osc_bool = False
         grf_bool = False
-        randomize_osc_params = False
+        randomize_osc_params = True
         grf_threshold = 0.1  # 20.  # Normalized to body weight
         omega_range = [1.0, 4.0]  # [0.0, 10.]
         coupling_range = [
@@ -170,6 +170,8 @@ class policy:
         smooth_exploration = False
 
         obs = [
+            "base_height",
+            "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -226,6 +228,26 @@ class weights:
         class termination_weight:
             termination = 15.0 / 100.0
 
+        class state_estimator:
+            class network:
+                hidden_dims = [128, 128]
+                activation = "elu"
+                dropouts = None
+
+            obs = [
+                "base_ang_vel",
+                "projected_gravity",
+                "dof_pos_obs",
+                "dof_vel",
+                "oscillator_obs",
+            ]
+            targets = ["base_height", "base_lin_vel"]
+            write_to = ["se_base_height", "se_base_lin_vel"]
+            batch_size = 2**15
+            max_gradient_steps = 10
+            normalize_obs = True
+            learning_rate = 1e-4
+
     class algorithm(MiniCheetahRunnerCfg.algorithm):
         # training params
         value_loss_coef = 1.0
@@ -245,6 +267,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
         experiment_name = "FullSend"
-        max_iterations = 500  # number of policy updates
+        max_iterations = 1000  # number of policy updates
         algorithm_class_name = "PPO2"
         num_steps_per_env = 32
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 5bbbaee9..2d141216 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -29,18 +29,23 @@ def _init_buffers(self):
 
     def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
+        self._reset_gait_frequencies(env_ids)
         self.phase[env_ids] = torch_rand_float(
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
 
     def _reset_gait_frequencies(self, env_ids):
-        self.omega = torch_rand_float()
+        self.omega[env_ids, 0] = torch_rand_float(
+            self.cfg.control.gait_freq[0],
+            self.cfg.control.gait_freq[1],
+            (len(env_ids), 1),
+            device=self.device,
+        ).squeeze(1)
 
     def _post_physx_step(self):
         super()._post_physx_step()
-        self.phase = (
-            self.phase + self.dt * self.omega / self.cfg.control.decimation
-        ).fmod(2 * torch.pi)
+        self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
+        self.phase.fmod(2 * torch.pi)
 
     def _post_decimation_step(self):
         super()._post_decimation_step()
@@ -62,6 +67,19 @@ def _switch(self):
             -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1
         )
 
+    def _reward_trot(self):
+        off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
+        phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
+        grf = self._compute_grf()
+        return (grf * torch.sin(phases)).mean(dim=1) * (1 - self._switch())
+
+    def _compute_grf(self, grf_norm=True):
+        grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
+        if grf_norm:
+            return torch.clamp_max(grf / 80.0, 1.0)
+        else:
+            return grf
+
     def _reward_swing_grf(self):
         """Reward non-zero grf during swing (0 to pi)"""
         in_contact = torch.gt(
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 4680c78d..ab8651d8 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -44,7 +44,7 @@ class asset(MiniCheetahCfg.asset):
             + "mini_cheetah/urdf/mini_cheetah_simple.urdf"
         )
         foot_name = "foot"
-        penalize_contacts_on = ["shank"]
+        penalize_contacts_on = ["shank", "thigh"]
         terminate_after_contacts_on = ["base", "thigh"]
         collapse_fixed_joints = False
         fix_base_link = False
@@ -86,8 +86,8 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
-            "se_base_height",
-            "se_base_lin_vel",
+            "base_height",
+            "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -145,8 +145,9 @@ class weights:
             feet_contact_forces = 0.0
             dof_near_home = 0.0
             reference_traj = 0.0
-            swing_grf = 1.5
-            stance_grf = 1.5
+            # swing_grf = 1.5
+            # stance_grf = 1.5
+            trot = 1.5
 
         class termination_weight:
             termination = 0.15
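The trot reward introduced above scores each foot's normalized ground reaction force against a sinusoidal weight on its gait phase: two legs share self.phase, the other two use the pi-shifted off_phase, so contact is rewarded on one half of the cycle and penalized on the other, with the two pairs exactly out of phase — which is what drives the optimum toward a trot. A small standalone check of the arithmetic (which foot indices form a diagonal pair is an assumption here, not taken from the repository):

    import torch

    def trot_reward(phase, grf):
        # phase: (N, 1) in [0, 2*pi); grf: (N, 4) normalized contact forces,
        # assuming feet 0 and 3 form one diagonal pair, feet 1 and 2 the other
        off_phase = torch.fmod(phase + torch.pi, 2 * torch.pi)
        phases = torch.cat((phase, off_phase, off_phase, phase), dim=1)
        return (grf * torch.sin(phases)).mean(dim=1)

    phase = torch.tensor([[0.5 * torch.pi]])
    print(trot_reward(phase, torch.tensor([[1.0, 0.0, 0.0, 1.0]])))  # tensor([0.5])
    print(trot_reward(phase, torch.tensor([[0.0, 1.0, 1.0, 0.0]])))  # tensor([-0.5])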
From 0ca782ca7f728de72e98f67c0e89285a2f95049a Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Thu, 18 Jul 2024 14:12:20 -0400
Subject: [PATCH 4/5] several tweaks, learns reasonably well on ref

---
 gym/envs/mini_cheetah/mini_cheetah_config.py     | 21 ++++---
 gym/envs/mini_cheetah/mini_cheetah_osc.py        |  7 ---
 .../mini_cheetah/mini_cheetah_osc_config.py      |  4 +-
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 62 ++++++++++++-----
 .../mini_cheetah/mini_cheetah_ref_config.py      | 52 +++++++++++++---
 learning/utils/logger/Logger.py                  |  5 +-
 scripts/play.py                                  |  5 +-
 7 files changed, 109 insertions(+), 47 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py
index 157c6031..12e4a11f 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_config.py
@@ -36,21 +36,21 @@ class init_state(LeggedRobotCfg.init_state):
         # * initialization for random range setup
         dof_pos_range = {
             "haa": [-0.01, 0.01],
-            "hfe": [-0.785398, -0.785398],
-            "kfe": [1.596976, 1.596976],
+            "hfe": [-0.885398, -0.585398],
+            "kfe": [1.396976, 1.796976],
         }
-        dof_vel_range = {"haa": [0.0, 0.0], "hfe": [0.0, 0.0], "kfe": [0.0, 0.0]}
+        dof_vel_range = {"haa": [0.0, 0.0], "hfe": [-0.2, 0.2], "kfe": [-0.2, 0.2]}
         root_pos_range = [
             [0.0, 0.0],  # x
             [0.0, 0.0],  # y
             [0.35, 0.35],  # z
-            [0.0, 0.0],  # roll
-            [0.0, 0.0],  # pitch
-            [0.0, 0.0],  # yaw
+            [-0.5, 0.5],  # roll
+            [-0.5, 0.5],  # pitch
+            [-0.5, 0.5],  # yaw
         ]
         root_vel_range = [
-            [-0.5, 2.0],  # x
-            [0.0, 0.0],  # y
+            [-1.0, 1.0],  # x
+            [-0.5, 0.5],  # y
             [-0.05, 0.05],  # z
             [0.0, 0.0],  # roll
             [0.0, 0.0],  # pitch
@@ -67,9 +67,10 @@ class control(LeggedRobotCfg.control):
 
     class commands:
         # * time before command are changed[s]
         resampling_time = 3.0
+        var = 1.0
 
         class ranges:
-            lin_vel_x = [-2.0, 3.0]  # min max [m/s]
+            lin_vel_x = [-1.0, 0.0, 1.0, 3.0]
             lin_vel_y = 1.0  # max [m/s]
             yaw_vel = 3  # max [rad/s]
 
@@ -82,7 +83,7 @@ class push_robots:
     class domain_rand:
         randomize_friction = True
         friction_range = [0.5, 1.0]
-        randomize_base_mass = False
+        randomize_base_mass = True
        added_mass_range = [-1.0, 1.0]
 
     class asset(LeggedRobotCfg.asset):
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py
index acec0b8a..3f65bca4 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py
@@ -266,13 +266,6 @@ def _compute_grf(self, grf_norm=True):
         else:
             return grf
 
-    def _switch(self):
-        c_vel = torch.linalg.norm(self.commands, dim=1)
-        return torch.exp(
-            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
-            / self.cfg.reward_settings.switch_scale
-        )
-
     def _reward_cursorial(self):
         # penalize the abad joints being away from 0
         return -torch.mean(
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
index f994ee88..c432fe59 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
@@ -170,8 +170,8 @@ class policy:
         smooth_exploration = False
 
         obs = [
-            "base_height",
-            "base_lin_vel",
+            "se_base_height",
+            "se_base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 2d141216..75f65fca 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -42,6 +42,45 @@ def _reset_gait_frequencies(self, env_ids):
             device=self.device,
         ).squeeze(1)
 
+    def _resample_commands(self, env_ids):
+        """Randomly select commands of some environments
+
+        Args:
+            env_ids (List[int]): Environments ids for which new commands are needed
+        """
+        if len(env_ids) == 0:
+            return
+        super()._resample_commands(env_ids)
+        possible_commands = torch.tensor(
+            self.command_ranges["lin_vel_x"], device=self.device
+        )
+        self.commands[env_ids, 0:1] = possible_commands[
+            torch.randint(
+                0, len(possible_commands), (len(env_ids), 1), device=self.device
+            )
+        ]
+        # add some gaussian noise to the commands
+        self.commands[env_ids, 0:1] += (
+            torch.randn((len(env_ids), 1), device=self.device) * self.cfg.commands.var
+        )
+
+        # if 0 in self.cfg.commands.ranges.lin_vel_x:
+        # * with 20% chance, reset to 0 commands except for forward
+        self.commands[env_ids, 1:] *= (
+            torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+            < 0.8
+        ).unsqueeze(1)
+        # * with 20% chance, reset to 0 commands except for rotation
+        self.commands[env_ids, :2] *= (
+            torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+            < 0.8
+        ).unsqueeze(1)
+        # * with 10% chance, reset to 0
+        # self.commands[env_ids, :] *= (
+        #     torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+        #     < 0.9
+        # ).unsqueeze(1)
+
     def _post_physx_step(self):
         super()._post_physx_step()
         self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
@@ -53,25 +92,18 @@ def _post_decimation_step(self):
             (torch.sin(self.phase), torch.cos(self.phase)), dim=1
         )
 
-    def _resample_commands(self, env_ids):
-        super()._resample_commands(env_ids)
-        # * with 10% chance, reset to 0 commands
-        rand_ids = torch_rand_float(
-            0, 1, (len(env_ids), 1), device=self.device
-        ).squeeze(1)
-        self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1)
-
     def _switch(self):
         c_vel = torch.linalg.norm(self.commands, dim=1)
         return torch.exp(
-            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1
+            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
+            / self.cfg.reward_settings.switch_scale
         )
 
     def _reward_trot(self):
         off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
         phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
         grf = self._compute_grf()
-        return (grf * torch.sin(phases)).mean(dim=1) * (1 - self._switch())
+        return (grf * torch.sin(phases)).mean(dim=1)  # * (1 - self._switch())
 
     def _compute_grf(self, grf_norm=True):
         grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
@@ -88,7 +120,7 @@ def _reward_swing_grf(self):
         )
         ph_off = torch.lt(self.phase, torch.pi)
         rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)
-        return -torch.sum(rew.float(), dim=1) * (1 - self._switch())
+        return -torch.sum(rew.float(), dim=1)  # * (1 - self._switch())
 
     def _reward_stance_grf(self):
         """Reward non-zero grf during stance (pi to 2pi)"""
@@ -99,7 +131,7 @@ def _reward_stance_grf(self):
         )
         ph_off = torch.gt(self.phase, torch.pi)  # should this be in swing?
 
         rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)
-        return torch.sum(rew.float(), dim=1) * (1 - self._switch())
+        return torch.sum(rew.float(), dim=1)  # * (1 - self._switch())
 
     def _reward_reference_traj(self):
         """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE"""
@@ -108,7 +140,7 @@ def _reward_reference_traj(self):
         error /= self.scales["dof_pos"]
         reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1)
         # * only when commanded velocity is higher
-        return reward * (1 - self._switch())
+        return reward  # * (1 - self._switch())
 
     def _get_ref(self):
         leg_frame = torch.zeros_like(self.torques)
@@ -136,10 +168,10 @@ def _reward_stand_still(self):
         rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1)
         rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1)
         rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1)
-        return (rew_vel + rew_pos - rew_base_vel) * self._switch()
+        return rew_vel + rew_pos - rew_base_vel  # * self._switch()
 
     def _reward_tracking_lin_vel(self):
         """Tracking linear velocity commands (xy axes)"""
         # just use lin_vel?
         reward = super()._reward_tracking_lin_vel()
-        return reward * (1 - self._switch())
+        return reward  # * (1 - self._switch())
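The reworked _resample_commands above draws the forward velocity command from the discrete set in commands.ranges.lin_vel_x, jitters it with Gaussian noise scaled by commands.var, and then zeroes parts of the command vector with fixed probabilities. A toy sketch of just the forward-velocity sampling (defaults mirror this patch's config, the function itself is illustrative, not the environment method):

    import torch

    def sample_lin_vel_x(num_envs, choices=(-1.0, 0.0, 1.0, 3.0), var=1.0):
        # pick a nominal command per environment from the discrete set,
        # then add Gaussian jitter so training still covers values in between
        options = torch.tensor(choices)
        idx = torch.randint(0, len(options), (num_envs, 1))
        return options[idx] + var * torch.randn(num_envs, 1)

    print(sample_lin_vel_x(5))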
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index ab8651d8..41d031d9 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -20,6 +20,7 @@ class init_state(MiniCheetahCfg.init_state):
         "{LEGGED_GYM_ROOT_DIR}/resources/robots/"
         + "mini_cheetah/trajectories/single_leg.csv"
     )
+    reset_mode = "reset_to_range"
 
     class control(MiniCheetahCfg.control):
         # * PD Drive parameters:
@@ -30,7 +31,14 @@ class control(MiniCheetahCfg.control):
         desired_sim_frequency = 500
 
     class commands(MiniCheetahCfg.commands):
-        pass
+        # * time before command are changed[s]
+        resampling_time = 3.0
+        var = 1.0
+
+        class ranges:
+            lin_vel_x = [-1.0, 0.0, 1.0, 3.0]
+            lin_vel_y = 1.0  # max [m/s]
+            yaw_vel = 3  # max [rad/s]
 
     class push_robots(MiniCheetahCfg.push_robots):
         pass
@@ -45,7 +53,7 @@ class asset(MiniCheetahCfg.asset):
         )
         foot_name = "foot"
         penalize_contacts_on = ["shank", "thigh"]
-        terminate_after_contacts_on = ["base", "thigh"]
+        terminate_after_contacts_on = ["base"]
         collapse_fixed_joints = False
         fix_base_link = False
         self_collisions = 1
@@ -58,8 +66,9 @@ class reward_settings(MiniCheetahCfg.reward_settings):
         soft_dof_vel_limit = 0.9
         soft_torque_limit = 0.9
         max_contact_force = 600.0
-        base_height_target = 0.3
+        base_height_target = 0.36
         tracking_sigma = 0.25
+        switch_scale = 0.5
 
     class scaling(MiniCheetahCfg.scaling):
         base_ang_vel = 0.3
@@ -86,14 +95,15 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
-            "base_height",
-            "base_lin_vel",
+            # "base_height",
+            # "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
             "dof_pos_obs",
             "dof_vel",
             "phase_obs",
+            "omega",
         ]
         normalize_obs = False
 
@@ -101,7 +111,7 @@ class actor(MiniCheetahRunnerCfg.actor):
         disable_actions = False
 
         class noise:
-            scale = 1.0
+            scale = 0.0
             dof_pos_obs = 0.01
             base_ang_vel = 0.01
             dof_pos = 0.005
@@ -123,6 +133,7 @@ class critic(MiniCheetahRunnerCfg.critic):
             "dof_pos_obs",
             "dof_vel",
             "phase_obs",
+            "omega",
             "dof_pos_target",
         ]
         normalize_obs = False
@@ -132,7 +143,7 @@ class weights:
             tracking_lin_vel = 4.0
             tracking_ang_vel = 2.0
             lin_vel_z = 0.0
-            ang_vel_xy = 0.01
+            ang_vel_xy = 0.0
             orientation = 1.0
             torques = 5.0e-7
             dof_vel = 0.0
@@ -141,13 +152,13 @@ class weights:
             action_rate = 0.01
             action_rate2 = 0.001
             stand_still = 0.0
-            dof_pos_limits = 0.0
+            dof_pos_limits = 0.5
             feet_contact_forces = 0.0
             dof_near_home = 0.0
             reference_traj = 0.0
             # swing_grf = 1.5
             # stance_grf = 1.5
-            trot = 1.5
+            trot = 3.0
 
         class termination_weight:
             termination = 0.15
@@ -173,7 +184,28 @@ class network:
         learning_rate = 1e-4
 
     class algorithm(MiniCheetahRunnerCfg.algorithm):
-        pass
+        # both
+        gamma = 0.99
+        lam = 0.95
+        GAE_bootstrap_horizon = 2.0
+        discount_horizon = 1.0
+        # shared
+        batch_size = 2**15
+        max_gradient_steps = 24
+        # new
+        storage_size = 2**17  # new
+
+        clip_param = 0.2
+        learning_rate = 1.0e-5
+        max_grad_norm = 1.0
+        # Critic
+        use_clipped_value_loss = True
+        # Actor
+        entropy_coef = 0.01
+        schedule = "adaptive"  # could be adaptive, fixed
+        desired_kl = 0.01
+        lr_range = [1e-5, 1e-2]
+        lr_ratio = 1.5
 
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
diff --git a/learning/utils/logger/Logger.py b/learning/utils/logger/Logger.py
index bcaa56a2..90642e48 100644
--- a/learning/utils/logger/Logger.py
+++ b/learning/utils/logger/Logger.py
@@ -79,7 +79,10 @@ def format_log_entry(key, v=None, append=""):
         if v is None:
             log_string += f"{key:>{pad}}: {append}\n"
         else:
-            log_string += f"{key:>{pad}}: {v:.2f} {append}\n"
+            if abs(v) > 1e-2:
+                log_string += f"{key:>{pad}}: {v:.2f} {append}\n"
+            else:
+                log_string += f"{key:>{pad}}: {v:.2e} {append}\n"
 
     def separator(subtitle="", marker="-"):
         nonlocal log_string
diff --git a/scripts/play.py b/scripts/play.py
index 37882f7e..d3e229cb 100644
--- a/scripts/play.py
+++ b/scripts/play.py
@@ -13,11 +13,12 @@ def setup(args):
     if hasattr(env_cfg, "push_robots"):
         env_cfg.push_robots.toggle = False
     if hasattr(env_cfg, "commands"):
-        env_cfg.commands.resampling_time = 9999
+        env_cfg.commands.resampling_time = 9999.0
     env_cfg.env.episode_length_s = 50
     env_cfg.env.num_projectiles = 20
     task_registry.make_gym_and_sim()
-    env_cfg.init_state.reset_mode = "reset_to_range"
+    env_cfg.init_state.reset_mode = "reset_to_basic"
+    # env_cfg.init_state.reset_mode = "reset_to_range"
    env = task_registry.make_env(args.task, env_cfg)
     train_cfg.runner.resume = True
     train_cfg.logging.enable_local_saving = False
From ee0220baab2885bdbd530c85008a3787a527a806 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Mon, 22 Jul 2024 17:47:03 -0400
Subject: [PATCH 5/5] grf added to internal state of mini_cheetah_ref

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 18 ++++++++++++++++--
 .../mini_cheetah/mini_cheetah_ref_config.py      | 10 ++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 75f65fca..908e4ef1 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -24,6 +24,13 @@ def _init_buffers(self):
             self.num_envs, 2, dtype=torch.float, device=self.device
         )
         self.omega = torch.ones(self.num_envs, 1, device=self.device)
+
+        # self.previous_base_lin_vel = torch.zeros_like(self.base_lin_vel)
+        # self.base_lin_acc = torch.zeros_like(self.base_lin_vel)
+
+        self.grf = self._compute_grf()
+        self.se_grf = self._compute_grf()
+
         self.se_base_height = torch.zeros_like(self.base_height)
         self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
@@ -33,6 +40,7 @@ def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
         self._reset_gait_frequencies(env_ids)
         self.phase[env_ids] = torch_rand_float(
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
+        self.se_base_lin_vel[env_ids] = self.base_lin_vel[env_ids]
 
     def _reset_gait_frequencies(self, env_ids):
         self.omega[env_ids, 0] = torch_rand_float(
@@ -86,11 +94,17 @@ def _post_physx_step(self):
         self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
         self.phase.fmod(2 * torch.pi)
 
+    def _pre_decimation_step(self):
+        super()._pre_decimation_step()
+        # self.previous_base_lin_vel = self.base_lin_vel.clone()
+
     def _post_decimation_step(self):
         super()._post_decimation_step()
         self.phase_obs = torch.cat(
             (torch.sin(self.phase), torch.cos(self.phase)), dim=1
         )
+        self.grf = self._compute_grf()
+        # self.base_lin_acc = (self.base_lin_vel - self.previous_base_lin_vel) / self.dt
 
     def _switch(self):
         c_vel = torch.linalg.norm(self.commands, dim=1)
@@ -102,8 +116,8 @@ def _switch(self):
     def _reward_trot(self):
         off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
         phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
-        grf = self._compute_grf()
+        # grf = self._compute_grf()
         return (self.grf * torch.sin(phases)).mean(dim=1)  # * (1 - self._switch())
 
     def _compute_grf(self, grf_norm=True):
         grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 41d031d9..9a594862 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -82,6 +82,7 @@ class scaling(MiniCheetahCfg.scaling):
         dof_pos_target = 4 * [0.2, 0.3, 0.3]
         tau_ff = 4 * [18, 18, 28]
         commands = [3, 1, 3]
+        torques = 4 * [9.0, 9.0, 14.0]
 
 
 class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
@@ -91,7 +92,7 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
     class actor(MiniCheetahRunnerCfg.actor):
         hidden_dims = [256, 256, 128]
         # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
-        activation = "elu"
+        activation = "tanh"
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
@@ -105,7 +106,7 @@ class actor(MiniCheetahRunnerCfg.actor):
             "phase_obs",
             "omega",
         ]
-        normalize_obs = False
+        normalize_obs = True
 
         actions = ["dof_pos_target"]
         disable_actions = False
@@ -174,10 +175,11 @@ class network:
             "projected_gravity",
             "dof_pos_obs",
             "dof_vel",
+            "torques",
             "phase_obs",
         ]
-        targets = ["base_height", "base_lin_vel"]
-        write_to = ["se_base_height", "se_base_lin_vel"]
+        targets = ["base_height", "base_lin_vel", "grf"]
+        write_to = ["se_base_height", "se_base_lin_vel", "se_grf"]
         batch_size = 2**15
         max_gradient_steps = 10
         normalize_obs = False
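Patch 5 widens the estimator's supervision: "grf" joins the targets and "se_grf" the write-back list, so the network now regresses base height (1), base linear velocity (3), and one normalized contact force per foot (4, for a quadruped) from proprioception plus torques. A hedged, self-contained stand-in for such a network — the input size is arbitrary here, and the running-statistics normalization is approximated with fixed buffers rather than the repository's RunningMeanStd class:

    import torch
    import torch.nn as nn

    class TinyStateEstimator(nn.Module):
        # illustrative stand-in: normalize inputs, then regress the 8 targets
        # implied by ["base_height", "base_lin_vel", "grf"] (1 + 3 + 4 feet)
        def __init__(self, num_inputs, num_outputs=8, hidden=(128, 128)):
            super().__init__()
            layers, last = [], num_inputs
            for h in hidden:
                layers += [nn.Linear(last, h), nn.ELU()]
                last = h
            layers.append(nn.Linear(last, num_outputs))
            self.net = nn.Sequential(*layers)
            self.register_buffer("obs_mean", torch.zeros(num_inputs))
            self.register_buffer("obs_var", torch.ones(num_inputs))

        def forward(self, obs):
            return self.net((obs - self.obs_mean) / torch.sqrt(self.obs_var + 1e-8))

    se = TinyStateEstimator(num_inputs=42)
    print(se(torch.randn(8, 42)).shape)  # torch.Size([8, 8])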