From 0ccbf4612060fb92b52a46145678230650137d24 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Mon, 15 Jul 2024 15:16:12 -0400
Subject: [PATCH 1/5] SE re-implemented

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        |  2 +
 .../mini_cheetah/mini_cheetah_ref_config.py      | 34 ++++++-
 learning/algorithms/SE.py                        | 48 ++++++++++
 learning/modules/state_estimator.py              | 14 ++-
 learning/runners/BaseRunner.py                   |  9 ++
 learning/runners/on_policy_runner.py             | 89 ++++++++++++++++---
 6 files changed, 181 insertions(+), 15 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 57e1f0e7..9cf75fb2 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -14,6 +14,8 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
         self.leg_ref = 3 * to_torch(pd.read_csv(csv_path).to_numpy(), device=sim_device)
         self.omega = 2 * torch.pi * cfg.control.gait_freq
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _init_buffers(self):
         super()._init_buffers()
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 74e60d59..17c60c3a 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -62,7 +62,17 @@ class reward_settings(MiniCheetahCfg.reward_settings):
         tracking_sigma = 0.25
 
     class scaling(MiniCheetahCfg.scaling):
-        pass
+        base_ang_vel = 0.3
+        base_lin_vel = BASE_HEIGHT_REF
+        se_base_lin_vel = base_lin_vel
+        dof_vel = 4 * [2.0, 2.0, 4.0]
+        base_height = 0.3
+        se_base_height = base_height
+        dof_pos = 4 * [0.2, 0.3, 0.3]
+        dof_pos_obs = dof_pos
+        dof_pos_target = 4 * [0.2, 0.3, 0.3]
+        tau_ff = 4 * [18, 18, 28]
+        commands = [3, 1, 3]
 
 
 class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
@@ -76,6 +86,8 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
+            "se_base_height",
+            "se_base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -139,6 +151,26 @@ class weights:
         class termination_weight:
             termination = 0.15
 
+    class state_estimator:
+        class network:
+            hidden_dims = [128, 128]
+            activation = "elu"
+            dropouts = None
+
+        obs = [
+            "base_ang_vel",
+            "projected_gravity",
+            "dof_pos_obs",
+            "dof_vel",
+            "phase_obs",
+        ]
+        targets = ["base_height", "base_lin_vel"]
+        write_to = ["se_base_height", "se_base_lin_vel"]
+        batch_size = 2**15
+        max_gradient_steps = 10
+        normalize_obs = True
+        learning_rate = 1e-4
+
     class algorithm(MiniCheetahRunnerCfg.algorithm):
         pass
diff --git a/learning/algorithms/SE.py b/learning/algorithms/SE.py
index 99dfb75f..059a5df5 100644
--- a/learning/algorithms/SE.py
+++ b/learning/algorithms/SE.py
@@ -3,9 +3,56 @@
 from learning.modules import StateEstimatorNN
 from learning.storage import SERolloutStorage
+from learning.utils import create_uniform_generator
 
 
 class StateEstimator:
+    def __init__(
+        self,
+        state_estimator,
+        normalize_obs=True,
+        batch_size=2**15,
+        max_gradient_steps=10,
+        learning_rate=1e-3,
+        device="cpu",
+        **kwargs,
+    ):
+        self.device = device
+
+        self.network = state_estimator.to(self.device)
+
+        self.batch_size = batch_size
+        self.max_gradient_steps = max_gradient_steps
+
+        self.learning_rate = learning_rate
+        self.mean_loss = 0.0
+        self.optimizer = optim.Adam(self.network.parameters(), lr=learning_rate)
+
+    def update(self, data):
+        self.mean_loss = 0
+        counter = 0
+        generator = create_uniform_generator(
+            data, self.batch_size, self.max_gradient_steps
+        )
+        for batch in generator:
+            loss = nn.functional.mse_loss(
+                self.network.evaluate(batch["SE_obs"]), batch["SE_targets"]
+            )
+            self.optimizer.zero_grad()
+            loss.backward()
+            self.optimizer.step()
+            self.mean_loss += loss.item()
+            counter += 1
+        self.mean_loss /= counter
+
+    def estimate(self, obs):
+        return self.network.evaluate(obs)
+
+    def export(self, path):
+        self.network.export(path)
+
+
+class OldStateEstimator:
     """This class provides a learned state estimator.
     This is trained with supervised learning, using only on-policy data
     collected in a rollout storage.
@@ -40,6 +87,7 @@ def __init__(
         self.state_estimator.to(self.device)
         self.optimizer = optim.Adam(self.state_estimator.parameters(), lr=learning_rate)
         self.SE_loss_fn = nn.MSELoss()
+        self.SE_loss = 0.0
 
     def init_storage(self, num_envs, num_transitions_per_env, obs_shape, se_shape):
         self.storage = SERolloutStorage(
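The update() loop above depends on create_uniform_generator from learning.utils, which is imported but not shown in this series. Below is a minimal, self-contained sketch of the assumed contract (uniformly sampled mini-batches drawn from flattened rollout tensors, capped at a fixed number of gradient steps) together with the same supervised MSE update that StateEstimator.update() performs; the generator name, tensor shapes, and toy network are illustrative assumptions, not the repository's implementation:

    import torch
    import torch.nn as nn
    import torch.optim as optim

    def uniform_minibatches(data, batch_size, max_batches):
        # assumed contract: flatten (steps, envs, dim) rollout tensors and
        # yield uniformly sampled mini-batches, at most max_batches of them
        flat = {k: v.reshape(-1, v.shape[-1]) for k, v in data.items()}
        n = next(iter(flat.values())).shape[0]
        for _ in range(max_batches):
            idx = torch.randint(0, n, (batch_size,))
            yield {k: v[idx] for k, v in flat.items()}

    # toy run of the supervised regression on synthetic data
    net = nn.Sequential(nn.Linear(6, 128), nn.ELU(), nn.Linear(128, 4))
    opt = optim.Adam(net.parameters(), lr=1e-4)
    data = {"SE_obs": torch.randn(32, 64, 6), "SE_targets": torch.randn(32, 64, 4)}
    for batch in uniform_minibatches(data, batch_size=256, max_batches=10):
        loss = nn.functional.mse_loss(net(batch["SE_obs"]), batch["SE_targets"])
        opt.zero_grad()
        loss.backward()
        opt.step()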
diff --git a/learning/modules/state_estimator.py b/learning/modules/state_estimator.py
index f8cfe221..26a606f7 100644
--- a/learning/modules/state_estimator.py
+++ b/learning/modules/state_estimator.py
@@ -1,6 +1,5 @@
 import torch.nn as nn
-from .utils import create_MLP
-from .utils import export_network
+from .utils import create_MLP, export_network, RunningMeanStd
 
 
 class StateEstimatorNN(nn.Module):
@@ -21,6 +20,7 @@ def __init__(
         hidden_dims=[256, 128],
         activation="elu",
         dropouts=None,
+        normalize_obs=True,
         **kwargs,
     ):
         if kwargs:
@@ -30,13 +30,19 @@ def __init__(
             )
         super().__init__()
 
+        self._normalize_obs = normalize_obs
+        if self._normalize_obs:
+            self.obs_rms = RunningMeanStd(num_inputs)
+
         self.num_inputs = num_inputs
         self.num_outputs = num_outputs
 
         self.NN = create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts)
         print(f"State Estimator MLP: {self.NN}")
 
-    def evaluate(self, observations):
-        return self.NN(observations)
+    def evaluate(self, obs):
+        if self._normalize_obs:
+            obs = self.obs_rms(obs)
+        return self.NN(obs)
 
     def export(self, path):
         export_network(self.NN, "state_estimator", path, self.num_inputs)
diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py
index 6819f4f4..ac23cdb1 100644
--- a/learning/runners/BaseRunner.py
+++ b/learning/runners/BaseRunner.py
@@ -34,10 +34,19 @@ def _set_up_alg(self):
 
     def parse_train_cfg(self, train_cfg):
         self.cfg = train_cfg["runner"]
+        del train_cfg["runner"]
+
         self.alg_cfg = train_cfg["algorithm"]
+        del train_cfg["algorithm"]
+
         remove_zero_weighted_rewards(train_cfg["critic"]["reward"]["weights"])
         self.actor_cfg = train_cfg["actor"]
+        del train_cfg["actor"]
+
         self.critic_cfg = train_cfg["critic"]
+        del train_cfg["critic"]
+
+        self.train_cfg = train_cfg
 
     def init_storage(self):
         raise NotImplementedError
diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py
index f601ce71..9381b09f 100644
--- a/learning/runners/on_policy_runner.py
+++ b/learning/runners/on_policy_runner.py
@@ -7,8 +7,12 @@
 from .BaseRunner import BaseRunner
 from learning.storage import DictStorage
+from learning.algorithms import StateEstimator
+from learning.modules import StateEstimatorNN
+
 
 logger = Logger()
 storage = DictStorage()
+SEStorage = DictStorage()
 
 
 class OnPolicyRunner(BaseRunner):
@@ -21,6 +25,21 @@ def __init__(self, env, train_cfg, device="cpu"):
             self.device,
         )
 
+    def _set_up_alg(self):
+        super()._set_up_alg()
+        if "state_estimator" in self.train_cfg.keys():
+            self.train_SE = True
+            cfg = self.train_cfg["state_estimator"]
+            # todo refactor cfg
+            state_estimator_network = StateEstimatorNN(
+                self.get_obs_size(cfg["obs"]),
+                self.get_obs_size(cfg["targets"]),
+                **cfg["network"],
+            )
+            self.SE = StateEstimator(state_estimator_network, device=self.device, **cfg)
+        else:
+            self.train_SE = False
+
     def learn(self, states_to_log_dict=None):
         self.set_up_logger()
 
@@ -45,6 +64,17 @@ def learn(self, states_to_log_dict=None):
                 "dones": self.get_timed_out(),
             }
         )
+
+        if self.train_SE:
+            transition.update(
+                {
+                    "SE_obs": self.get_obs(self.train_cfg["state_estimator"]["obs"]),
+                    "SE_targets": self.get_obs(
+                        self.train_cfg["state_estimator"]["targets"]
+                    ),
+                }
+            )
+
         storage.initialize(
             transition,
             self.env.num_envs,
@@ -87,6 +117,23 @@ def learn(self, states_to_log_dict=None):
 
                 self.env.step()
 
+                if self.train_SE:
+                    # use the state-estimator
+                    SE_obs = self.get_obs(self.train_cfg["state_estimator"]["obs"])
+                    self.env.set_states(
+                        self.train_cfg["state_estimator"]["write_to"],
+                        self.SE.estimate(SE_obs),
+                    )
+                    # update storage for training estimator
+                    transition.update(
+                        {
+                            "SE_obs": SE_obs,
+                            "SE_targets": self.get_obs(
+                                self.train_cfg["state_estimator"]["targets"]
+                            ),
+                        }
+                    )
+
                 actor_obs = self.get_noisy_obs(
                     self.actor_cfg["obs"], self.actor_cfg["noise"]
                 )
@@ -109,6 +156,7 @@ def learn(self, states_to_log_dict=None):
                         "dones": dones,
                     }
                 )
+
                 storage.add_transitions(transition)
 
                 logger.log_rewards(rewards_dict)
@@ -118,6 +166,8 @@ def learn(self, states_to_log_dict=None):
 
             logger.tic("learning")
             self.alg.update(storage.data)
+            if self.train_SE:
+                self.SE.update(storage.data)
             storage.clear()
             logger.toc("learning")
             logger.log_all_categories()
@@ -173,21 +222,31 @@ def set_up_logger(self):
         )
         logger.register_category("actor", self.alg.actor, ["action_std", "entropy"])
 
+        if self.train_SE:
+            logger.register_category("state_estimator", self.SE, ["mean_loss"])
+
         logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic))
 
     def save(self):
         os.makedirs(self.log_dir, exist_ok=True)
         path = os.path.join(self.log_dir, "model_{}.pt".format(self.it))
-        torch.save(
-            {
-                "actor_state_dict": self.alg.actor.state_dict(),
-                "critic_state_dict": self.alg.critic.state_dict(),
-                "optimizer_state_dict": self.alg.optimizer.state_dict(),
-                "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(),
-                "iter": self.it,
-            },
-            path,
-        )
+        save_dict = {
+            "actor_state_dict": self.alg.actor.state_dict(),
+            "critic_state_dict": self.alg.critic.state_dict(),
+            "optimizer_state_dict": self.alg.optimizer.state_dict(),
+            "critic_optimizer_state_dict": self.alg.critic_optimizer.state_dict(),
+            "iter": self.it,
+        }
+
+        if self.train_SE:
+            save_dict.update(
+                {
+                    "SE_state_dict": self.SE.network.state_dict(),
+                    "SE_optimizer_state_dict": self.SE.optimizer.state_dict(),
+                }
+            )
+
+        torch.save(save_dict, path)
 
     def load(self, path, load_optimizer=True):
         loaded_dict = torch.load(path)
@@ -199,17 +258,28 @@ def load(self, path, load_optimizer=True):
                 loaded_dict["critic_optimizer_state_dict"]
             )
         self.it = loaded_dict["iter"]
+        if self.train_SE:
+            self.SE.network.load_state_dict(loaded_dict["SE_state_dict"])
+            self.SE.optimizer.load_state_dict(loaded_dict["SE_optimizer_state_dict"])
 
     def switch_to_eval(self):
         self.alg.actor.eval()
         self.alg.critic.eval()
 
     def get_inference_actions(self):
+        if self.train_SE:
+            SE_obs = self.get_obs(self.train_cfg["state_estimator"]["obs"])
+            self.env.set_states(
+                self.train_cfg["state_estimator"]["write_to"],
+                self.SE.estimate(SE_obs),
+            )
         obs = self.get_noisy_obs(self.actor_cfg["obs"], self.actor_cfg["noise"])
         return self.alg.actor.act_inference(obs)
 
     def export(self, path):
         self.alg.actor.export(path)
+        if self.train_SE:
+            self.SE.export(path)
 
     def sim_and_log_states(self, states_to_log_dict, it_idx):
         # Simulate environment for as many steps as expected in the dict.
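The runner changes in this patch all follow one ordering: build the estimator's observation, run the network, write the estimates into the environment's se_* states via env.set_states, and only then assemble the actor observation, so a policy that lists "se_base_height" / "se_base_lin_vel" in its obs reads estimated quantities rather than the privileged simulator state. A stripped-down sketch of that step, where every object is a stand-in for the corresponding runner/env attribute rather than the repository's API:

    def apply_state_estimate(env, se, se_cfg, get_obs):
        # 1) estimator input: the proprioceptive observations named in se_cfg["obs"]
        se_obs = get_obs(se_cfg["obs"])
        # 2) run the estimator and write its outputs into the env's se_* buffers
        env.set_states(se_cfg["write_to"], se.estimate(se_obs))
        # 3) the caller builds the actor observation afterwards, so it can
        #    include the freshly written "se_*" entries
        return se_obs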
From a09c33574bdc87e362e75004be3eb83f7e7d1e2b Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Tue, 16 Jul 2024 11:17:57 -0400
Subject: [PATCH 2/5] tweaks

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 10 +++++++---
 gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 12 ++++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 9cf75fb2..5bbbaee9 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -12,10 +12,8 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
             LEGGED_GYM_ROOT_DIR=LEGGED_GYM_ROOT_DIR
         )
         self.leg_ref = 3 * to_torch(pd.read_csv(csv_path).to_numpy(), device=sim_device)
-        self.omega = 2 * torch.pi * cfg.control.gait_freq
+        # self.omega = 2 * torch.pi * cfg.control.gait_freq
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
-        self.se_base_height = torch.zeros_like(self.base_height)
-        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _init_buffers(self):
         super()._init_buffers()
@@ -25,6 +23,9 @@ def _init_buffers(self):
         self.phase_obs = torch.zeros(
             self.num_envs, 2, dtype=torch.float, device=self.device
         )
+        self.omega = torch.ones(self.num_envs, 1, device=self.device)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
@@ -32,6 +33,9 @@ def _reset_system(self, env_ids):
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
 
+    def _reset_gait_frequencies(self, env_ids):
+        self.omega = torch_rand_float()
+
     def _post_physx_step(self):
         super()._post_physx_step()
         self.phase = (
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 17c60c3a..4680c78d 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -25,7 +25,7 @@ class control(MiniCheetahCfg.control):
         # * PD Drive parameters:
         stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0}
         damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5}
-        gait_freq = 3.0
+        gait_freq = [0.0, 3.0]
         ctrl_frequency = 100
         desired_sim_frequency = 500
 
@@ -95,7 +95,7 @@ class actor(MiniCheetahRunnerCfg.actor):
             "dof_vel",
             "phase_obs",
         ]
-        normalize_obs = True
+        normalize_obs = False
 
         actions = ["dof_pos_target"]
         disable_actions = False
@@ -125,7 +125,7 @@ class critic(MiniCheetahRunnerCfg.critic):
             "phase_obs",
             "dof_pos_target",
         ]
-        normalize_obs = True
+        normalize_obs = False
 
     class reward:
         class weights:
@@ -144,7 +144,7 @@ class weights:
             dof_pos_limits = 0.0
             feet_contact_forces = 0.0
             dof_near_home = 0.0
-            reference_traj = 1.5
+            reference_traj = 0.0
             swing_grf = 1.5
             stance_grf = 1.5
@@ -168,7 +168,7 @@ class network:
         write_to = ["se_base_height", "se_base_lin_vel"]
         batch_size = 2**15
         max_gradient_steps = 10
-        normalize_obs = True
+        normalize_obs = False
         learning_rate = 1e-4
 
     class algorithm(MiniCheetahRunnerCfg.algorithm):
@@ -177,6 +177,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
         experiment_name = "mini_cheetah_ref"
-        max_iterations = 1000  # number of policy updates
+        max_iterations = 500  # number of policy updates
         algorithm_class_name = "PPO2"
         num_steps_per_env = 32  # deprecate

From 937e001a06280aa02943441d09970ab0753165a1 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Tue, 16 Jul 2024 14:47:58 -0400
Subject: [PATCH 3/5] trot reward based on osc (smoothed with sin)

---
 gym/envs/mini_cheetah/mini_cheetah.py            |  2 ++
 .../mini_cheetah/mini_cheetah_osc_config.py      | 26 +++++++++++++++++--
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 26 ++++++++++++++++---
 .../mini_cheetah/mini_cheetah_ref_config.py      | 11 ++++----
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah.py b/gym/envs/mini_cheetah/mini_cheetah.py
index bea90140..99cd5bc5 100644
--- a/gym/envs/mini_cheetah/mini_cheetah.py
+++ b/gym/envs/mini_cheetah/mini_cheetah.py
@@ -6,6 +6,8 @@ class MiniCheetah(LeggedRobot):
 
     def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
         super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
+        self.se_base_height = torch.zeros_like(self.base_height)
+        self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
     def _reward_lin_vel_z(self):
         """Penalize z axis base linear velocity with squared exp"""
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
index 9d686f8e..f994ee88 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
@@ -74,7 +74,7 @@ class osc:
         coupling = 1  # 0.02
         osc_bool = False
         grf_bool = False
-        randomize_osc_params = False
+        randomize_osc_params = True
         grf_threshold = 0.1  # 20.  # Normalized to body weight
         omega_range = [1.0, 4.0]  # [0.0, 10.]
         coupling_range = [
@@ -170,6 +170,8 @@ class policy:
         smooth_exploration = False
 
         obs = [
+            "base_height",
+            "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -226,6 +228,26 @@ class weights:
         class termination_weight:
             termination = 15.0 / 100.0
 
+        class state_estimator:
+            class network:
+                hidden_dims = [128, 128]
+                activation = "elu"
+                dropouts = None
+
+            obs = [
+                "base_ang_vel",
+                "projected_gravity",
+                "dof_pos_obs",
+                "dof_vel",
+                "oscillator_obs",
+            ]
+            targets = ["base_height", "base_lin_vel"]
+            write_to = ["se_base_height", "se_base_lin_vel"]
+            batch_size = 2**15
+            max_gradient_steps = 10
+            normalize_obs = True
+            learning_rate = 1e-4
+
     class algorithm(MiniCheetahRunnerCfg.algorithm):
         # training params
         value_loss_coef = 1.0
@@ -245,6 +267,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
         experiment_name = "FullSend"
-        max_iterations = 500  # number of policy updates
+        max_iterations = 1000  # number of policy updates
         algorithm_class_name = "PPO2"
         num_steps_per_env = 32
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 5bbbaee9..2d141216 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -29,18 +29,23 @@ def _init_buffers(self):
 
     def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
+        self._reset_gait_frequencies(env_ids)
         self.phase[env_ids] = torch_rand_float(
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
 
     def _reset_gait_frequencies(self, env_ids):
-        self.omega = torch_rand_float()
+        self.omega[env_ids, 0] = torch_rand_float(
+            self.cfg.control.gait_freq[0],
+            self.cfg.control.gait_freq[1],
+            (len(env_ids), 1),
+            device=self.device,
+        ).squeeze(1)
 
     def _post_physx_step(self):
         super()._post_physx_step()
-        self.phase = (
-            self.phase + self.dt * self.omega / self.cfg.control.decimation
-        ).fmod(2 * torch.pi)
+        self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
+        self.phase.fmod(2 * torch.pi)
 
     def _post_decimation_step(self):
         super()._post_decimation_step()
@@ -62,6 +67,19 @@ def _switch(self):
             -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1
         )
 
+    def _reward_trot(self):
+        off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
+        phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
+        grf = self._compute_grf()
+        return (grf * torch.sin(phases)).mean(dim=1) * (1 - self._switch())
+
+    def _compute_grf(self, grf_norm=True):
+        grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
+        if grf_norm:
+            return torch.clamp_max(grf / 80.0, 1.0)
+        else:
+            return grf
+
     def _reward_swing_grf(self):
         """Reward non-zero grf during swing (0 to pi)"""
         in_contact = torch.gt(
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 4680c78d..ab8651d8 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -44,7 +44,7 @@ class asset(MiniCheetahCfg.asset):
             + "mini_cheetah/urdf/mini_cheetah_simple.urdf"
         )
         foot_name = "foot"
-        penalize_contacts_on = ["shank"]
+        penalize_contacts_on = ["shank", "thigh"]
         terminate_after_contacts_on = ["base", "thigh"]
         collapse_fixed_joints = False
         fix_base_link = False
@@ -86,8 +86,8 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
-            "se_base_height",
-            "se_base_lin_vel",
+            "base_height",
+            "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
@@ -145,8 +145,9 @@ class weights:
             feet_contact_forces = 0.0
             dof_near_home = 0.0
             reference_traj = 0.0
-            swing_grf = 1.5
-            stance_grf = 1.5
+            # swing_grf = 1.5
+            # stance_grf = 1.5
+            trot = 1.5
 
         class termination_weight:
             termination = 0.15
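The trot reward introduced above scores each foot's normalized ground reaction force against a sinusoidal weight on its gait phase: two legs share self.phase, the other two use the pi-shifted off_phase, so contact is rewarded on one half of the cycle and penalized on the other, with the two pairs exactly out of phase — which is what drives the optimum toward a trot. A small standalone check of the arithmetic (which foot indices form a diagonal pair is an assumption here, not taken from the repository):

    import torch

    def trot_reward(phase, grf):
        # phase: (N, 1) in [0, 2*pi); grf: (N, 4) normalized contact forces,
        # assuming feet 0 and 3 form one diagonal pair, feet 1 and 2 the other
        off_phase = torch.fmod(phase + torch.pi, 2 * torch.pi)
        phases = torch.cat((phase, off_phase, off_phase, phase), dim=1)
        return (grf * torch.sin(phases)).mean(dim=1)

    phase = torch.tensor([[0.5 * torch.pi]])
    print(trot_reward(phase, torch.tensor([[1.0, 0.0, 0.0, 1.0]])))  # tensor([0.5])
    print(trot_reward(phase, torch.tensor([[0.0, 1.0, 1.0, 0.0]])))  # tensor([-0.5])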
From 0ca782ca7f728de72e98f67c0e89285a2f95049a Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Thu, 18 Jul 2024 14:12:20 -0400
Subject: [PATCH 4/5] several tweaks, learns reasonably well on ref

---
 gym/envs/mini_cheetah/mini_cheetah_config.py     | 21 ++++---
 gym/envs/mini_cheetah/mini_cheetah_osc.py        |  7 ---
 .../mini_cheetah/mini_cheetah_osc_config.py      |  4 +-
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 62 ++++++++++++-----
 .../mini_cheetah/mini_cheetah_ref_config.py      | 52 +++++++++++++---
 learning/utils/logger/Logger.py                  |  5 +-
 scripts/play.py                                  |  5 +-
 7 files changed, 109 insertions(+), 47 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py
index 157c6031..12e4a11f 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_config.py
@@ -36,21 +36,21 @@ class init_state(LeggedRobotCfg.init_state):
         # * initialization for random range setup
         dof_pos_range = {
             "haa": [-0.01, 0.01],
-            "hfe": [-0.785398, -0.785398],
-            "kfe": [1.596976, 1.596976],
+            "hfe": [-0.885398, -0.585398],
+            "kfe": [1.396976, 1.796976],
         }
-        dof_vel_range = {"haa": [0.0, 0.0], "hfe": [0.0, 0.0], "kfe": [0.0, 0.0]}
+        dof_vel_range = {"haa": [0.0, 0.0], "hfe": [-0.2, 0.2], "kfe": [-0.2, 0.2]}
         root_pos_range = [
             [0.0, 0.0],  # x
             [0.0, 0.0],  # y
             [0.35, 0.35],  # z
-            [0.0, 0.0],  # roll
-            [0.0, 0.0],  # pitch
-            [0.0, 0.0],  # yaw
+            [-0.5, 0.5],  # roll
+            [-0.5, 0.5],  # pitch
+            [-0.5, 0.5],  # yaw
         ]
         root_vel_range = [
-            [-0.5, 2.0],  # x
-            [0.0, 0.0],  # y
+            [-1.0, 1.0],  # x
+            [-0.5, 0.5],  # y
             [-0.05, 0.05],  # z
             [0.0, 0.0],  # roll
             [0.0, 0.0],  # pitch
@@ -67,9 +67,10 @@ class control(LeggedRobotCfg.control):
 
     class commands:
         # * time before command are changed[s]
         resampling_time = 3.0
+        var = 1.0
 
         class ranges:
-            lin_vel_x = [-2.0, 3.0]  # min max [m/s]
+            lin_vel_x = [-1.0, 0.0, 1.0, 3.0]
             lin_vel_y = 1.0  # max [m/s]
             yaw_vel = 3  # max [rad/s]
 
@@ -82,7 +83,7 @@ class push_robots:
     class domain_rand:
         randomize_friction = True
         friction_range = [0.5, 1.0]
-        randomize_base_mass = False
+        randomize_base_mass = True
        added_mass_range = [-1.0, 1.0]
 
     class asset(LeggedRobotCfg.asset):
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py
index acec0b8a..3f65bca4 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py
@@ -266,13 +266,6 @@ def _compute_grf(self, grf_norm=True):
         else:
             return grf
 
-    def _switch(self):
-        c_vel = torch.linalg.norm(self.commands, dim=1)
-        return torch.exp(
-            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
-            / self.cfg.reward_settings.switch_scale
-        )
-
     def _reward_cursorial(self):
         # penalize the abad joints being away from 0
         return -torch.mean(
diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
index f994ee88..c432fe59 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py
@@ -170,8 +170,8 @@ class policy:
         smooth_exploration = False
 
         obs = [
-            "base_height",
-            "base_lin_vel",
+            "se_base_height",
+            "se_base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 2d141216..75f65fca 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -42,6 +42,45 @@ def _reset_gait_frequencies(self, env_ids):
             device=self.device,
         ).squeeze(1)
 
+    def _resample_commands(self, env_ids):
+        """Randomly select commands of some environments
+
+        Args:
+            env_ids (List[int]): Environments ids for which new commands are needed
+        """
+        if len(env_ids) == 0:
+            return
+        super()._resample_commands(env_ids)
+        possible_commands = torch.tensor(
+            self.command_ranges["lin_vel_x"], device=self.device
+        )
+        self.commands[env_ids, 0:1] = possible_commands[
+            torch.randint(
+                0, len(possible_commands), (len(env_ids), 1), device=self.device
+            )
+        ]
+        # add some gaussian noise to the commands
+        self.commands[env_ids, 0:1] += (
+            torch.randn((len(env_ids), 1), device=self.device) * self.cfg.commands.var
+        )
+
+        # if 0 in self.cfg.commands.ranges.lin_vel_x:
+        # * with 20% chance, reset to 0 commands except for forward
+        self.commands[env_ids, 1:] *= (
+            torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+            < 0.8
+        ).unsqueeze(1)
+        # * with 20% chance, reset to 0 commands except for rotation
+        self.commands[env_ids, :2] *= (
+            torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+            < 0.8
+        ).unsqueeze(1)
+        # * with 10% chance, reset to 0
+        # self.commands[env_ids, :] *= (
+        #     torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
+        #     < 0.9
+        # ).unsqueeze(1)
+
     def _post_physx_step(self):
         super()._post_physx_step()
         self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
@@ -53,25 +92,18 @@ def _post_decimation_step(self):
             (torch.sin(self.phase), torch.cos(self.phase)), dim=1
         )
 
-    def _resample_commands(self, env_ids):
-        super()._resample_commands(env_ids)
-        # * with 10% chance, reset to 0 commands
-        rand_ids = torch_rand_float(
-            0, 1, (len(env_ids), 1), device=self.device
-        ).squeeze(1)
-        self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1)
-
     def _switch(self):
         c_vel = torch.linalg.norm(self.commands, dim=1)
         return torch.exp(
-            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1
+            -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
+            / self.cfg.reward_settings.switch_scale
         )
 
     def _reward_trot(self):
         off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
         phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
         grf = self._compute_grf()
-        return (grf * torch.sin(phases)).mean(dim=1) * (1 - self._switch())
+        return (grf * torch.sin(phases)).mean(dim=1)  # * (1 - self._switch())
 
     def _compute_grf(self, grf_norm=True):
         grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
@@ -88,7 +120,7 @@ def _reward_swing_grf(self):
         )
         ph_off = torch.lt(self.phase, torch.pi)
         rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)
-        return -torch.sum(rew.float(), dim=1) * (1 - self._switch())
+        return -torch.sum(rew.float(), dim=1)  # * (1 - self._switch())
 
     def _reward_stance_grf(self):
         """Reward non-zero grf during stance (pi to 2pi)"""
@@ -99,7 +131,7 @@ def _reward_stance_grf(self):
         )
         ph_off = torch.gt(self.phase, torch.pi)  # should this be in swing?
 
         rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)
-        return torch.sum(rew.float(), dim=1) * (1 - self._switch())
+        return torch.sum(rew.float(), dim=1)  # * (1 - self._switch())
 
     def _reward_reference_traj(self):
         """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE"""
@@ -108,7 +140,7 @@ def _reward_reference_traj(self):
         error /= self.scales["dof_pos"]
         reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1)
         # * only when commanded velocity is higher
-        return reward * (1 - self._switch())
+        return reward  # * (1 - self._switch())
 
     def _get_ref(self):
         leg_frame = torch.zeros_like(self.torques)
@@ -136,10 +168,10 @@ def _reward_stand_still(self):
         rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1)
         rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1)
         rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1)
-        return (rew_vel + rew_pos - rew_base_vel) * self._switch()
+        return rew_vel + rew_pos - rew_base_vel  # * self._switch()
 
     def _reward_tracking_lin_vel(self):
         """Tracking linear velocity commands (xy axes)"""
         # just use lin_vel?
         reward = super()._reward_tracking_lin_vel()
-        return reward * (1 - self._switch())
+        return reward  # * (1 - self._switch())
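The reworked _resample_commands above draws the forward velocity command from the discrete set in commands.ranges.lin_vel_x, jitters it with Gaussian noise scaled by commands.var, and then zeroes parts of the command vector with fixed probabilities. A toy sketch of just the forward-velocity sampling (defaults mirror this patch's config, the function itself is illustrative, not the environment method):

    import torch

    def sample_lin_vel_x(num_envs, choices=(-1.0, 0.0, 1.0, 3.0), var=1.0):
        # pick a nominal command per environment from the discrete set,
        # then add Gaussian jitter so training still covers values in between
        options = torch.tensor(choices)
        idx = torch.randint(0, len(options), (num_envs, 1))
        return options[idx] + var * torch.randn(num_envs, 1)

    print(sample_lin_vel_x(5))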
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index ab8651d8..41d031d9 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -20,6 +20,7 @@ class init_state(MiniCheetahCfg.init_state):
         "{LEGGED_GYM_ROOT_DIR}/resources/robots/"
         + "mini_cheetah/trajectories/single_leg.csv"
     )
+    reset_mode = "reset_to_range"
 
     class control(MiniCheetahCfg.control):
         # * PD Drive parameters:
@@ -30,7 +31,14 @@ class control(MiniCheetahCfg.control):
         desired_sim_frequency = 500
 
     class commands(MiniCheetahCfg.commands):
-        pass
+        # * time before command are changed[s]
+        resampling_time = 3.0
+        var = 1.0
+
+        class ranges:
+            lin_vel_x = [-1.0, 0.0, 1.0, 3.0]
+            lin_vel_y = 1.0  # max [m/s]
+            yaw_vel = 3  # max [rad/s]
 
     class push_robots(MiniCheetahCfg.push_robots):
         pass
@@ -45,7 +53,7 @@ class asset(MiniCheetahCfg.asset):
         )
         foot_name = "foot"
         penalize_contacts_on = ["shank", "thigh"]
-        terminate_after_contacts_on = ["base", "thigh"]
+        terminate_after_contacts_on = ["base"]
         collapse_fixed_joints = False
         fix_base_link = False
         self_collisions = 1
@@ -58,8 +66,9 @@ class reward_settings(MiniCheetahCfg.reward_settings):
         soft_dof_vel_limit = 0.9
         soft_torque_limit = 0.9
         max_contact_force = 600.0
-        base_height_target = 0.3
+        base_height_target = 0.36
         tracking_sigma = 0.25
+        switch_scale = 0.5
 
     class scaling(MiniCheetahCfg.scaling):
         base_ang_vel = 0.3
@@ -86,14 +95,15 @@ class actor(MiniCheetahRunnerCfg.actor):
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
-            "base_height",
-            "base_lin_vel",
+            # "base_height",
+            # "base_lin_vel",
             "base_ang_vel",
             "projected_gravity",
             "commands",
             "dof_pos_obs",
             "dof_vel",
             "phase_obs",
+            "omega",
         ]
         normalize_obs = False
 
@@ -101,7 +111,7 @@ class actor(MiniCheetahRunnerCfg.actor):
         disable_actions = False
 
         class noise:
-            scale = 1.0
+            scale = 0.0
             dof_pos_obs = 0.01
             base_ang_vel = 0.01
             dof_pos = 0.005
@@ -123,6 +133,7 @@ class critic(MiniCheetahRunnerCfg.critic):
             "dof_pos_obs",
             "dof_vel",
             "phase_obs",
+            "omega",
             "dof_pos_target",
         ]
         normalize_obs = False
@@ -132,7 +143,7 @@ class weights:
             tracking_lin_vel = 4.0
             tracking_ang_vel = 2.0
             lin_vel_z = 0.0
-            ang_vel_xy = 0.01
+            ang_vel_xy = 0.0
             orientation = 1.0
             torques = 5.0e-7
             dof_vel = 0.0
@@ -141,13 +152,13 @@ class weights:
             action_rate = 0.01
             action_rate2 = 0.001
             stand_still = 0.0
-            dof_pos_limits = 0.0
+            dof_pos_limits = 0.5
             feet_contact_forces = 0.0
             dof_near_home = 0.0
             reference_traj = 0.0
             # swing_grf = 1.5
             # stance_grf = 1.5
-            trot = 1.5
+            trot = 3.0
 
         class termination_weight:
             termination = 0.15
@@ -173,7 +184,28 @@ class network:
         learning_rate = 1e-4
 
     class algorithm(MiniCheetahRunnerCfg.algorithm):
-        pass
+        # both
+        gamma = 0.99
+        lam = 0.95
+        GAE_bootstrap_horizon = 2.0
+        discount_horizon = 1.0
+        # shared
+        batch_size = 2**15
+        max_gradient_steps = 24
+        # new
+        storage_size = 2**17  # new
+
+        clip_param = 0.2
+        learning_rate = 1.0e-5
+        max_grad_norm = 1.0
+        # Critic
+        use_clipped_value_loss = True
+        # Actor
+        entropy_coef = 0.01
+        schedule = "adaptive"  # could be adaptive, fixed
+        desired_kl = 0.01
+        lr_range = [1e-5, 1e-2]
+        lr_ratio = 1.5
 
     class runner(MiniCheetahRunnerCfg.runner):
         run_name = ""
diff --git a/learning/utils/logger/Logger.py b/learning/utils/logger/Logger.py
index bcaa56a2..90642e48 100644
--- a/learning/utils/logger/Logger.py
+++ b/learning/utils/logger/Logger.py
@@ -79,7 +79,10 @@ def format_log_entry(key, v=None, append=""):
         if v is None:
             log_string += f"{key:>{pad}}: {append}\n"
         else:
-            log_string += f"{key:>{pad}}: {v:.2f} {append}\n"
+            if abs(v) > 1e-2:
+                log_string += f"{key:>{pad}}: {v:.2f} {append}\n"
+            else:
+                log_string += f"{key:>{pad}}: {v:.2e} {append}\n"
 
     def separator(subtitle="", marker="-"):
         nonlocal log_string
diff --git a/scripts/play.py b/scripts/play.py
index 37882f7e..d3e229cb 100644
--- a/scripts/play.py
+++ b/scripts/play.py
@@ -13,11 +13,12 @@ def setup(args):
     if hasattr(env_cfg, "push_robots"):
         env_cfg.push_robots.toggle = False
     if hasattr(env_cfg, "commands"):
-        env_cfg.commands.resampling_time = 9999
+        env_cfg.commands.resampling_time = 9999.0
     env_cfg.env.episode_length_s = 50
     env_cfg.env.num_projectiles = 20
     task_registry.make_gym_and_sim()
-    env_cfg.init_state.reset_mode = "reset_to_range"
+    env_cfg.init_state.reset_mode = "reset_to_basic"
+    # env_cfg.init_state.reset_mode = "reset_to_range"
    env = task_registry.make_env(args.task, env_cfg)
     train_cfg.runner.resume = True
     train_cfg.logging.enable_local_saving = False
From ee0220baab2885bdbd530c85008a3787a527a806 Mon Sep 17 00:00:00 2001
From: Steve Heim
Date: Mon, 22 Jul 2024 17:47:03 -0400
Subject: [PATCH 5/5] grf added to internal state of mini_cheetah_ref

---
 gym/envs/mini_cheetah/mini_cheetah_ref.py        | 18 ++++++++++++++++--
 .../mini_cheetah/mini_cheetah_ref_config.py      | 10 ++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py
index 75f65fca..908e4ef1 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -24,6 +24,13 @@ def _init_buffers(self):
             self.num_envs, 2, dtype=torch.float, device=self.device
         )
         self.omega = torch.ones(self.num_envs, 1, device=self.device)
+
+        # self.previous_base_lin_vel = torch.zeros_like(self.base_lin_vel)
+        # self.base_lin_acc = torch.zeros_like(self.base_lin_vel)
+
+        self.grf = self._compute_grf()
+        self.se_grf = self._compute_grf()
+
         self.se_base_height = torch.zeros_like(self.base_height)
         self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)
 
@@ -33,6 +40,7 @@ def _reset_system(self, env_ids):
         super()._reset_system(env_ids)
         self._reset_gait_frequencies(env_ids)
         self.phase[env_ids] = torch_rand_float(
             0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
         )
+        self.se_base_lin_vel[env_ids] = self.base_lin_vel[env_ids]
 
     def _reset_gait_frequencies(self, env_ids):
         self.omega[env_ids, 0] = torch_rand_float(
@@ -86,11 +94,17 @@ def _post_physx_step(self):
         self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
         self.phase.fmod(2 * torch.pi)
 
+    def _pre_decimation_step(self):
+        super()._pre_decimation_step()
+        # self.previous_base_lin_vel = self.base_lin_vel.clone()
+
     def _post_decimation_step(self):
         super()._post_decimation_step()
         self.phase_obs = torch.cat(
             (torch.sin(self.phase), torch.cos(self.phase)), dim=1
         )
+        self.grf = self._compute_grf()
+        # self.base_lin_acc = (self.base_lin_vel - self.previous_base_lin_vel) / self.dt
 
     def _switch(self):
         c_vel = torch.linalg.norm(self.commands, dim=1)
@@ -102,8 +116,8 @@ def _switch(self):
     def _reward_trot(self):
         off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
         phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
-        grf = self._compute_grf()
+        # grf = self._compute_grf()
         return (self.grf * torch.sin(phases)).mean(dim=1)  # * (1 - self._switch())
 
     def _compute_grf(self, grf_norm=True):
         grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
index 41d031d9..9a594862 100644
--- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
+++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py
@@ -82,6 +82,7 @@ class scaling(MiniCheetahCfg.scaling):
         dof_pos_target = 4 * [0.2, 0.3, 0.3]
         tau_ff = 4 * [18, 18, 28]
         commands = [3, 1, 3]
+        torques = 4 * [9.0, 9.0, 14.0]
 
 
 class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
@@ -91,7 +92,7 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg):
     class actor(MiniCheetahRunnerCfg.actor):
         hidden_dims = [256, 256, 128]
         # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
-        activation = "elu"
+        activation = "tanh"
         smooth_exploration = False
         exploration_sample_freq = 16
         obs = [
@@ -105,7 +106,7 @@ class actor(MiniCheetahRunnerCfg.actor):
             "phase_obs",
             "omega",
         ]
-        normalize_obs = False
+        normalize_obs = True
 
         actions = ["dof_pos_target"]
         disable_actions = False
@@ -174,10 +175,11 @@ class network:
             "projected_gravity",
             "dof_pos_obs",
             "dof_vel",
+            "torques",
             "phase_obs",
         ]
-        targets = ["base_height", "base_lin_vel"]
-        write_to = ["se_base_height", "se_base_lin_vel"]
+        targets = ["base_height", "base_lin_vel", "grf"]
+        write_to = ["se_base_height", "se_base_lin_vel", "se_grf"]
         batch_size = 2**15
         max_gradient_steps = 10
         normalize_obs = False
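Patch 5 widens the estimator's supervision: "grf" joins the targets and "se_grf" the write-back list, so the network now regresses base height (1), base linear velocity (3), and one normalized contact force per foot (4, for a quadruped) from proprioception plus torques. A hedged, self-contained stand-in for such a network — the input size is arbitrary here, and the running-statistics normalization is approximated with fixed buffers rather than the repository's RunningMeanStd class:

    import torch
    import torch.nn as nn

    class TinyStateEstimator(nn.Module):
        # illustrative stand-in: normalize inputs, then regress the 8 targets
        # implied by ["base_height", "base_lin_vel", "grf"] (1 + 3 + 4 feet)
        def __init__(self, num_inputs, num_outputs=8, hidden=(128, 128)):
            super().__init__()
            layers, last = [], num_inputs
            for h in hidden:
                layers += [nn.Linear(last, h), nn.ELU()]
                last = h
            layers.append(nn.Linear(last, num_outputs))
            self.net = nn.Sequential(*layers)
            self.register_buffer("obs_mean", torch.zeros(num_inputs))
            self.register_buffer("obs_var", torch.ones(num_inputs))

        def forward(self, obs):
            return self.net((obs - self.obs_mean) / torch.sqrt(self.obs_var + 1e-8))

    se = TinyStateEstimator(num_inputs=42)
    print(se(torch.randn(8, 42)).shape)  # torch.Size([8, 8])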