2 changes: 2 additions & 0 deletions gym/envs/mini_cheetah/mini_cheetah.py
@@ -6,6 +6,8 @@
class MiniCheetah(LeggedRobot):
def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
super().__init__(gym, sim, cfg, sim_params, sim_device, headless)
self.se_base_height = torch.zeros_like(self.base_height)
self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)

def _reward_lin_vel_z(self):
"""Penalize z axis base linear velocity with squared exp"""
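The two buffers added in __init__ above (se_base_height, se_base_lin_vel) are zero-initialized placeholders: the state_estimator block added in mini_cheetah_osc_config.py below lists exactly these names under write_to, and the policy observes them in place of the privileged base_height / base_lin_vel. A minimal sketch of that write path, assuming the runner holds a dict of predictions keyed by the same names (the helper below is illustrative, not the repository's API):

import torch

def write_estimates(env, write_to, predictions):
    # Copy each estimator output into the env buffer named in `write_to`,
    # e.g. env.se_base_height and env.se_base_lin_vel.
    for name in write_to:
        getattr(env, name).copy_(predictions[name])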
21 changes: 11 additions & 10 deletions gym/envs/mini_cheetah/mini_cheetah_config.py
@@ -36,21 +36,21 @@ class init_state(LeggedRobotCfg.init_state):
# * initialization for random range setup
dof_pos_range = {
"haa": [-0.01, 0.01],
"hfe": [-0.785398, -0.785398],
"kfe": [1.596976, 1.596976],
"hfe": [-0.885398, -0.585398],
"kfe": [1.396976, 1.796976],
}
dof_vel_range = {"haa": [0.0, 0.0], "hfe": [0.0, 0.0], "kfe": [0.0, 0.0]}
dof_vel_range = {"haa": [0.0, 0.0], "hfe": [-0.2, 0.2], "kfe": [-0.2, 0.2]}
root_pos_range = [
[0.0, 0.0], # x
[0.0, 0.0], # y
[0.35, 0.35], # z
[0.0, 0.0], # roll
[0.0, 0.0], # pitch
[0.0, 0.0], # yaw
[-0.5, 0.5], # roll
[-0.5, 0.5], # pitch
[-0.5, 0.5], # yaw
]
root_vel_range = [
[-0.5, 2.0], # x
[0.0, 0.0], # y
[-1.0, 1.0], # x
[-0.5, 0.5], # y
[-0.05, 0.05], # z
[0.0, 0.0], # roll
[0.0, 0.0], # pitch
@@ -67,9 +67,10 @@ class control(LeggedRobotCfg.control):
class commands:
# * time before commands are changed [s]
resampling_time = 3.0
var = 1.0

class ranges:
lin_vel_x = [-2.0, 3.0] # min max [m/s]
lin_vel_x = [-1.0, 0.0, 1.0, 3.0]
lin_vel_y = 1.0 # max [m/s]
yaw_vel = 3 # max [rad/s]
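lin_vel_x has changed from a [min, max] interval to a discrete set of candidate forward speeds, and the new commands.var acts as the standard deviation of Gaussian noise layered on top; the _resample_commands override in mini_cheetah_ref.py below consumes both. A standalone sketch of the resulting forward-speed sampling, assuming those semantics:

import torch

lin_vel_x = [-1.0, 0.0, 1.0, 3.0]  # candidate forward speeds [m/s]
var = 1.0                          # std-dev of the Gaussian perturbation

def sample_forward_commands(num_envs, device="cpu"):
    candidates = torch.tensor(lin_vel_x, device=device)
    idx = torch.randint(0, len(candidates), (num_envs, 1), device=device)
    # pick a nominal speed per env, then jitter it
    return candidates[idx] + torch.randn(num_envs, 1, device=device) * var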

@@ -82,7 +83,7 @@ class push_robots:
class domain_rand:
randomize_friction = True
friction_range = [0.5, 1.0]
randomize_base_mass = False
randomize_base_mass = True
added_mass_range = [-1.0, 1.0]
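With randomize_base_mass enabled, each robot's trunk mass is perturbed within added_mass_range when the actors are created. In legged_gym-style environments this is typically done in _process_rigid_body_props; a rough sketch of that convention, not necessarily this repository's exact code:

import numpy as np

def _process_rigid_body_props(self, props, env_id):
    # props[0] is assumed to be the base/trunk rigid body
    if self.cfg.domain_rand.randomize_base_mass:
        low, high = self.cfg.domain_rand.added_mass_range
        props[0].mass += np.random.uniform(low, high)  # here: up to +/-1 kg
    return props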

class asset(LeggedRobotCfg.asset):
7 changes: 0 additions & 7 deletions gym/envs/mini_cheetah/mini_cheetah_osc.py
@@ -266,13 +266,6 @@ def _compute_grf(self, grf_norm=True):
else:
return grf

def _switch(self):
c_vel = torch.linalg.norm(self.commands, dim=1)
return torch.exp(
-torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
/ self.cfg.reward_settings.switch_scale
)

def _reward_cursorial(self):
# penalize the abad joints being away from 0
return -torch.mean(
26 changes: 24 additions & 2 deletions gym/envs/mini_cheetah/mini_cheetah_osc_config.py
@@ -74,7 +74,7 @@ class osc:
coupling = 1 # 0.02
osc_bool = False
grf_bool = False
randomize_osc_params = False
randomize_osc_params = True
grf_threshold = 0.1 # 20. # Normalized to body weight
omega_range = [1.0, 4.0] # [0.0, 10.]
coupling_range = [
@@ -170,6 +170,8 @@ class policy:
smooth_exploration = False

obs = [
"se_base_height",
"se_base_lin_vel",
"base_ang_vel",
"projected_gravity",
"commands",
@@ -226,6 +228,26 @@ class weights:
class termination_weight:
termination = 15.0 / 100.0

class state_estimator:
class network:
hidden_dims = [128, 128]
activation = "elu"
dropouts = None

obs = [
"base_ang_vel",
"projected_gravity",
"dof_pos_obs",
"dof_vel",
"oscillator_obs",
]
targets = ["base_height", "base_lin_vel"]
write_to = ["se_base_height", "se_base_lin_vel"]
batch_size = 2**15
max_gradient_steps = 10
normalize_obs = True
learning_rate = 1e-4
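The new state_estimator block configures a small supervised network that regresses base_height and base_lin_vel from the listed proprioceptive observations; its predictions are written into the se_* buffers named in write_to and fed to the policy. A minimal PyTorch sketch consistent with these settings (class and function names are illustrative, not the repository's API):

import torch
import torch.nn as nn

class StateEstimator(nn.Module):
    """MLP regressor from proprioceptive obs to (base_height, base_lin_vel)."""

    def __init__(self, num_obs, num_targets, hidden_dims=(128, 128)):
        super().__init__()
        layers, in_dim = [], num_obs
        for h in hidden_dims:
            layers += [nn.Linear(in_dim, h), nn.ELU()]
            in_dim = h
        layers.append(nn.Linear(in_dim, num_targets))
        self.net = nn.Sequential(*layers)

    def forward(self, obs):
        return self.net(obs)

def update(estimator, optimizer, obs_batch, target_batch, max_gradient_steps=10):
    # Plain supervised regression against the simulator's ground-truth targets.
    for _ in range(max_gradient_steps):
        loss = nn.functional.mse_loss(estimator(obs_batch), target_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()

With the settings above, optimizer would plausibly be torch.optim.Adam(estimator.parameters(), lr=1e-4), batches of 2**15 transitions would be drawn from recent rollouts, and observations would be normalized before the forward pass; the diff confirms only the field names, not these mechanics.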

class algorithm(MiniCheetahRunnerCfg.algorithm):
# training params
value_loss_coef = 1.0
@@ -245,6 +267,6 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
class runner(MiniCheetahRunnerCfg.runner):
run_name = ""
experiment_name = "FullSend"
max_iterations = 500 # number of policy updates
max_iterations = 1000 # number of policy updates
algorithm_class_name = "PPO2"
num_steps_per_env = 32
106 changes: 88 additions & 18 deletions gym/envs/mini_cheetah/mini_cheetah_ref.py
@@ -12,7 +12,7 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
LEGGED_GYM_ROOT_DIR=LEGGED_GYM_ROOT_DIR
)
self.leg_ref = 3 * to_torch(pd.read_csv(csv_path).to_numpy(), device=sim_device)
self.omega = 2 * torch.pi * cfg.control.gait_freq
# self.omega = 2 * torch.pi * cfg.control.gait_freq
super().__init__(gym, sim, cfg, sim_params, sim_device, headless)

def _init_buffers(self):
@@ -23,39 +23,109 @@ def _init_buffers(self):
self.phase_obs = torch.zeros(
self.num_envs, 2, dtype=torch.float, device=self.device
)
self.omega = torch.ones(self.num_envs, 1, device=self.device)

# self.previous_base_lin_vel = torch.zeros_like(self.base_lin_vel)
# self.base_lin_acc = torch.zeros_like(self.base_lin_vel)

self.grf = self._compute_grf()
self.se_grf = self._compute_grf()

self.se_base_height = torch.zeros_like(self.base_height)
self.se_base_lin_vel = torch.zeros_like(self.base_lin_vel)

def _reset_system(self, env_ids):
super()._reset_system(env_ids)
self._reset_gait_frequencies(env_ids)
self.phase[env_ids] = torch_rand_float(
0, torch.pi, shape=self.phase[env_ids].shape, device=self.device
)
self.se_base_lin_vel[env_ids] = self.base_lin_vel[env_ids]

def _reset_gait_frequencies(self, env_ids):
self.omega[env_ids, 0] = torch_rand_float(
self.cfg.control.gait_freq[0],
self.cfg.control.gait_freq[1],
(len(env_ids), 1),
device=self.device,
).squeeze(1)

def _resample_commands(self, env_ids):
"""Randommly select commands of some environments

Args:
env_ids (List[int]): Environment ids for which new commands are needed
"""
if len(env_ids) == 0:
return
super()._resample_commands(env_ids)
possible_commands = torch.tensor(
self.command_ranges["lin_vel_x"], device=self.device
)
self.commands[env_ids, 0:1] = possible_commands[
torch.randint(
0, len(possible_commands), (len(env_ids), 1), device=self.device
)
]
# add some gaussian noise to the commands
self.commands[env_ids, 0:1] += (
torch.randn((len(env_ids), 1), device=self.device) * self.cfg.commands.var
)

# if 0 in self.cfg.commands.ranges.lin_vel_x:
# * with 20% chance, reset to 0 commands except for forward
self.commands[env_ids, 1:] *= (
torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
< 0.8
).unsqueeze(1)
# * with 20% chance, reset to 0 commands except for rotation
self.commands[env_ids, :2] *= (
torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
< 0.8
).unsqueeze(1)
# * with 10% chance, reset to 0
# self.commands[env_ids, :] *= (
# torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1)
# < 0.9
# ).unsqueeze(1)

def _post_physx_step(self):
super()._post_physx_step()
self.phase = (
self.phase + self.dt * self.omega / self.cfg.control.decimation
).fmod(2 * torch.pi)
self.phase += self.dt * 2 * torch.pi * self.omega / self.cfg.control.decimation
self.phase = self.phase.fmod(2 * torch.pi)  # Tensor.fmod is out-of-place; reassign so the wrap takes effect
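With omega now a per-env gait frequency sampled in _reset_gait_frequencies from cfg.control.gait_freq, the phase advances by dt * 2*pi * omega / decimation per call and wraps to [0, 2*pi); the division by decimation suggests the update runs once per physics substep. A tiny standalone check of the update, with illustrative timestep values:

import torch

dt, decimation = 0.02, 4       # illustrative control timestep and decimation
omega = torch.tensor([[2.0]])  # 2 Hz gait -> one full cycle every 0.5 s
phase = torch.zeros(1, 1)

for _ in range(200):           # 200 physics substeps = 1 s at dt / decimation
    phase += dt * 2 * torch.pi * omega / decimation
    phase = phase.fmod(2 * torch.pi)  # reassignment keeps the wrap effective

print(phase)  # ~0 (mod 2*pi): two full gait cycles completed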

def _pre_decimation_step(self):
super()._pre_decimation_step()
# self.previous_base_lin_vel = self.base_lin_vel.clone()

def _post_decimation_step(self):
super()._post_decimation_step()
self.phase_obs = torch.cat(
(torch.sin(self.phase), torch.cos(self.phase)), dim=1
)

def _resample_commands(self, env_ids):
super()._resample_commands(env_ids)
# * with 10% chance, reset to 0 commands
rand_ids = torch_rand_float(
0, 1, (len(env_ids), 1), device=self.device
).squeeze(1)
self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1)
self.grf = self._compute_grf()
# self.base_lin_acc = (self.base_lin_vel - self.previous_base_lin_vel) / self.dt

def _switch(self):
c_vel = torch.linalg.norm(self.commands, dim=1)
return torch.exp(
-torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1
-torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1))
/ self.cfg.reward_settings.switch_scale
)
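_switch is a soft standstill gate: it stays near 1 while the command norm is below roughly 0.1 and decays smoothly as the command grows, with reward_settings.switch_scale setting how quickly. Several reward terms in this file were previously multiplied by switch or (1 - switch); the diff comments those factors out but keeps _switch itself. A quick numeric check, with an assumed switch_scale of 0.5:

import torch

def switch(cmd_norm, switch_scale=0.5):
    excess = torch.clamp(cmd_norm - 0.1, min=0.0)
    return torch.exp(-excess.square() / switch_scale)

print(switch(torch.tensor([0.0, 0.1, 0.5, 2.0])))
# -> ~[1.00, 1.00, 0.73, 0.00]: near 1 at standstill, near 0 once commanded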

def _reward_trot(self):
off_phase = torch.fmod(self.phase + torch.pi, 2 * torch.pi)
phases = torch.cat((self.phase, off_phase, off_phase, self.phase), dim=1)
# grf = self._compute_grf()
return (self.grf * torch.sin(phases)).mean(dim=1) # * (1 - self._switch())
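The trot reward pairs legs diagonally: the first and fourth entries of phases share self.phase while the second and third run pi out of phase, so (grf * sin(phases)).mean() is largest when a diagonal pair loads the ground during the half-cycle where sin is positive. A small sanity check with idealized contact forces (the actual foot ordering is an assumption):

import torch

phase = torch.tensor([[0.5 * torch.pi]])           # sin(phase) = 1 here
off_phase = torch.fmod(phase + torch.pi, 2 * torch.pi)
phases = torch.cat((phase, off_phase, off_phase, phase), dim=1)

grf_trot = torch.tensor([[1.0, 0.0, 0.0, 1.0]])    # diagonal pair in contact
grf_pace = torch.tensor([[1.0, 1.0, 0.0, 0.0]])    # lateral pair in contact
print((grf_trot * torch.sin(phases)).mean(dim=1))  # tensor([0.5])  rewarded
print((grf_pace * torch.sin(phases)).mean(dim=1))  # tensor([0.])   not rewarded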

def _compute_grf(self, grf_norm=True):
grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
if grf_norm:
return torch.clamp_max(grf / 80.0, 1.0)
else:
return grf

def _reward_swing_grf(self):
"""Reward non-zero grf during swing (0 to pi)"""
in_contact = torch.gt(
@@ -64,7 +134,7 @@ def _reward_swing_grf(self):
)
ph_off = torch.lt(self.phase, torch.pi)
rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)
return -torch.sum(rew.float(), dim=1) * (1 - self._switch())
return -torch.sum(rew.float(), dim=1) # * (1 - self._switch())

def _reward_stance_grf(self):
"""Reward non-zero grf during stance (pi to 2pi)"""
@@ -75,7 +145,7 @@ def _reward_stance_grf(self):
ph_off = torch.gt(self.phase, torch.pi) # should this be in swing?
rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1)

return torch.sum(rew.float(), dim=1) * (1 - self._switch())
return torch.sum(rew.float(), dim=1) # * (1 - self._switch())

def _reward_reference_traj(self):
"""REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE"""
@@ -84,7 +154,7 @@ def _reward_reference_traj(self):
error /= self.scales["dof_pos"]
reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1)
# * only when commanded velocity is higher
return reward * (1 - self._switch())
return reward # * (1 - self._switch())

def _get_ref(self):
leg_frame = torch.zeros_like(self.torques)
@@ -112,10 +182,10 @@ def _reward_stand_still(self):
rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1)
rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1)
rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1)
return (rew_vel + rew_pos - rew_base_vel) * self._switch()
return rew_vel + rew_pos - rew_base_vel # * self._switch()

def _reward_tracking_lin_vel(self):
"""Tracking linear velocity commands (xy axes)"""
# just use lin_vel?
reward = super()._reward_tracking_lin_vel()
return reward * (1 - self._switch())
return reward # * (1 - self._switch())